From b226c1227bcf6412076ecf787421135fd2c42ef0 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Jun 2024 19:21:26 +0800 Subject: [PATCH 01/39] refine .gitignore (#7688) This adds tags and android ndk into the git ignore list --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 049efd703..5223c6963 100644 --- a/.gitignore +++ b/.gitignore @@ -34,9 +34,11 @@ ggml-metal-embed.metal lcov-report/ gcovr-report/ +tags build* !build.zig cmake-build-* +android-ndk-* out/ tmp/ From 987d743d6bc4cee4bde6820733ea33a2abc0afac Mon Sep 17 00:00:00 2001 From: Daniele <57776841+daniandtheweb@users.noreply.github.com> Date: Tue, 4 Jun 2024 12:09:15 +0000 Subject: [PATCH 02/39] Improve hipBLAS support in CMake (#7696) * Improve hipBLAS support in CMake This improves the detection of the correct CMAKE_PREFIX_PATH when using different distributions or a self-built ROCm SDK. * Set ROCM_PATH correctly --- CMakeLists.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 620305ca7..76ea27412 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -557,12 +557,17 @@ if (LLAMA_VULKAN) endif() if (LLAMA_HIPBLAS) - if ($ENV{ROCM_PATH}) - set(ROCM_PATH $ENV{ROCM_PATH}) + if (NOT EXISTS $ENV{ROCM_PATH}) + if (NOT EXISTS /opt/rocm) + set(ROCM_PATH /usr) + else() + set(ROCM_PATH /opt/rocm) + endif() else() - set(ROCM_PATH /opt/rocm) + set(ROCM_PATH $ENV{ROCM_PATH}) endif() list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) + list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake") # CMake on Windows doesn't support the HIP language yet if(WIN32) From adc9ff384121f4d550d28638a646b336d051bf42 Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 4 Jun 2024 14:32:42 +0200 Subject: [PATCH 03/39] llama-bench : allow using a different printer for stderr with -oe (#7722) compare-commits.sh : hide stdout, use -oe to print markdown --- examples/llama-bench/llama-bench.cpp | 145 +++++++++++++++++---------- scripts/compare-commits.sh | 16 +-- 2 files changed, 101 insertions(+), 60 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index c00890447..5d3cbd842 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -140,10 +140,11 @@ static std::string get_gpu_info() { } // command line params -enum output_formats {CSV, JSON, MARKDOWN, SQL}; +enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL}; static const char * output_format_str(output_formats format) { switch (format) { + case NONE: return "none"; case CSV: return "csv"; case JSON: return "json"; case MARKDOWN: return "md"; @@ -152,6 +153,23 @@ static const char * output_format_str(output_formats format) { } } +static bool output_format_from_str(const std::string & s, output_formats & format) { + if (s == "none") { + format = NONE; + } else if (s == "csv") { + format = CSV; + } else if (s == "json") { + format = JSON; + } else if (s == "md") { + format = MARKDOWN; + } else if (s == "sql") { + format = SQL; + } else { + return false; + } + return true; +} + static const char * split_mode_str(llama_split_mode mode) { switch (mode) { case LLAMA_SPLIT_MODE_NONE: return "none"; @@ -190,31 +208,33 @@ struct cmd_params { int reps; bool verbose; output_formats output_format; + output_formats output_format_stderr; }; static const cmd_params cmd_params_defaults = { - /* model */ {"models/7B/ggml-model-q4_0.gguf"}, - /* n_prompt */ {512}, - /* n_gen */ {128}, - /* n_pg */ {}, - /* n_batch */ {2048}, - /* 
n_ubatch */ {512}, - /* type_k */ {GGML_TYPE_F16}, - /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {cpu_get_num_math()}, - /* n_gpu_layers */ {99}, - /* rpc_servers */ {""}, - /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, - /* main_gpu */ {0}, - /* no_kv_offload */ {false}, - /* flash_attn */ {false}, - /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, - /* use_mmap */ {true}, - /* embeddings */ {false}, - /* numa */ GGML_NUMA_STRATEGY_DISABLED, - /* reps */ 5, - /* verbose */ false, - /* output_format */ MARKDOWN + /* model */ {"models/7B/ggml-model-q4_0.gguf"}, + /* n_prompt */ {512}, + /* n_gen */ {128}, + /* n_pg */ {}, + /* n_batch */ {2048}, + /* n_ubatch */ {512}, + /* type_k */ {GGML_TYPE_F16}, + /* type_v */ {GGML_TYPE_F16}, + /* n_threads */ {cpu_get_num_math()}, + /* n_gpu_layers */ {99}, + /* rpc_servers */ {""}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, + /* main_gpu */ {0}, + /* no_kv_offload */ {false}, + /* flash_attn */ {false}, + /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, + /* use_mmap */ {true}, + /* embeddings */ {false}, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, + /* reps */ 5, + /* verbose */ false, + /* output_format */ MARKDOWN, + /* output_format_stderr */ NONE, }; static void print_usage(int /* argc */, char ** argv) { @@ -243,6 +263,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr)); printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); @@ -284,6 +305,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { params.verbose = cmd_params_defaults.verbose; params.output_format = cmd_params_defaults.output_format; + params.output_format_stderr = cmd_params_defaults.output_format_stderr; params.reps = cmd_params_defaults.reps; for (int i = 1; i < argc; i++) { @@ -493,18 +515,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - if (argv[i] == std::string("csv")) { - params.output_format = CSV; - } else if (argv[i] == std::string("json")) { - params.output_format = JSON; - } else if (argv[i] == std::string("md")) { - params.output_format = MARKDOWN; - } else if (argv[i] == std::string("sql")) { - params.output_format = SQL; - } else { + invalid_param = !output_format_from_str(argv[i], params.output_format); + } else if (arg == "-oe" || arg == "--output-err") { + if (++i >= argc) { invalid_param = true; break; } + invalid_param = !output_format_from_str(argv[i], params.output_format_stderr); } else if (arg == "-v" || arg == "--verbose") { params.verbose = true; } else { @@ -1278,6 +1295,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text (void) user_data; } +static std::unique_ptr create_printer(output_formats format) { + switch (format) { + case NONE: + return nullptr; + case CSV: + return std::unique_ptr(new csv_printer()); + case JSON: + return std::unique_ptr(new json_printer()); + case MARKDOWN: + return std::unique_ptr(new markdown_printer()); + case SQL: + return std::unique_ptr(new sql_printer()); + } + GGML_ASSERT(false); +} + int main(int 
argc, char ** argv) { // try to set locale for unicode characters in markdown setlocale(LC_CTYPE, ".UTF-8"); @@ -1304,26 +1337,18 @@ int main(int argc, char ** argv) { llama_numa_init(params.numa); // initialize printer - std::unique_ptr p; - switch (params.output_format) { - case CSV: - p.reset(new csv_printer()); - break; - case JSON: - p.reset(new json_printer()); - break; - case MARKDOWN: - p.reset(new markdown_printer()); - break; - case SQL: - p.reset(new sql_printer()); - break; - default: - assert(false); - exit(1); + std::unique_ptr p = create_printer(params.output_format); + std::unique_ptr p_err = create_printer(params.output_format_stderr); + + if (p) { + p->fout = stdout; + p->print_header(params); + } + + if (p_err) { + p_err->fout = stderr; + p_err->print_header(params); } - p->fout = stdout; - p->print_header(params); std::vector params_instances = get_cmd_params_instances(params); @@ -1381,7 +1406,15 @@ int main(int argc, char ** argv) { t.samples_ns.push_back(t_ns); } - p->print_test(t); + if (p) { + p->print_test(t); + fflush(p->fout); + } + + if (p_err) { + p_err->print_test(t); + fflush(p_err->fout); + } llama_print_timings(ctx); @@ -1390,7 +1423,13 @@ int main(int argc, char ** argv) { llama_free_model(lmodel); - p->print_footer(); + if (p) { + p->print_footer(); + } + + if (p_err) { + p_err->print_footer(); + } llama_backend_free(); diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index fd0ee88b2..a45cd3962 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -10,16 +10,18 @@ set -x bench_args="${@:3}" -rm -f llama-bench.sqlite +rm -f llama-bench.sqlite > /dev/null # to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...) -git checkout $1 -make clean && make -j32 $make_opts llama-bench -./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite +git checkout $1 > /dev/null +make clean > /dev/null +make -j$(nproc) $make_opts llama-bench > /dev/null +./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite -git checkout $2 -make clean && make -j32 $make_opts llama-bench -./llama-bench -o sql $bench_args | tee /dev/tty | sqlite3 llama-bench.sqlite +git checkout $2 > /dev/null +make clean > /dev/null +make -j$(nproc) $make_opts llama-bench > /dev/null +./llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite ./scripts/compare-llama-bench.py -b $1 -c $2 From 5ca0944a153b65724d51b2f484139aa25ccb7a8b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 19:43:01 +0300 Subject: [PATCH 04/39] readme : remove obsolete Zig instructions (#7471) --- README.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/README.md b/README.md index 8680460aa..1f2d9b1f2 100644 --- a/README.md +++ b/README.md @@ -364,17 +364,6 @@ In order to build llama.cpp you have four different options. cmake --build build --config Debug ``` -- Using `Zig` (version 0.11 or later): - - Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C, - it's also possible to cross compile for other operating systems and architectures: - - ```bash - zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c - ``` - - The `zig targets` command will give you valid options to use. - - Using `gmake` (FreeBSD): 1. 
Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) From 0cd6bd3483fa66124b76a8a8ac794d9ee18c70c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 21:23:05 +0300 Subject: [PATCH 05/39] llama : remove beam search (#7736) --- Makefile | 6 +- examples/CMakeLists.txt | 1 - examples/beam-search/CMakeLists.txt | 5 - examples/beam-search/beam-search.cpp | 188 -------------------- llama.cpp | 254 --------------------------- llama.h | 42 +---- 6 files changed, 2 insertions(+), 494 deletions(-) delete mode 100644 examples/beam-search/CMakeLists.txt delete mode 100644 examples/beam-search/beam-search.cpp diff --git a/Makefile b/Makefile index b527f6f35..27eb69871 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \ retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests @@ -914,10 +914,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b40ee4ccb..53002f8e1 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -15,7 +15,6 @@ else() add_subdirectory(baby-llama) add_subdirectory(batched) add_subdirectory(batched-bench) - add_subdirectory(beam-search) add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt deleted file mode 100644 index f0e37468b..000000000 --- a/examples/beam-search/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET beam-search) -add_executable(${TARGET} beam-search.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp deleted file mode 100644 index 3d34378a5..000000000 --- a/examples/beam-search/beam-search.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "common.h" -#include "llama.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) -#include -#include -#elif defined (_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#include -#endif - -// Used for debugging to 
print out beam tokens. -struct ostream_beam_view { - llama_context * ctx; - llama_beam_view beam_view; -}; - -static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) { - os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens("; - for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) { - os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]); - } - return os << ')'; -} - -// Put here anything you want back in beam_search_callback(). -struct beam_search_callback_data { - llama_context * ctx; - std::vector response; -}; - -// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. -// For example, eob can be flagged due to maximum token length, stop words, etc. -static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]); -} - -// Function matching type llama_beam_search_callback_fn_t. -// Custom callback example is called each time the beams lengths increase: -// * Show progress by printing ',' following by number of convergent beam tokens if any. -// * When all beams converge to a common prefix, they are made available in beams_state.beams[0]. -// This is also called when the stop condition is met. -// Collect tokens into std::vector response which is pointed to by callback_data. -static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) { - auto& callback_data = *static_cast(callback_data_ptr); - // Mark beams as EOS as needed. - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - llama_beam_view& beam_view = beams_state.beam_views[i]; - if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) { - beam_view.eob = true; - } - } - printf(","); // Show progress - if (const size_t n = beams_state.common_prefix_length) { - callback_data.response.resize(callback_data.response.size() + n); - assert(0u < beams_state.n_beams); - const llama_token * tokens = beams_state.beam_views[0].tokens; - std::copy(tokens, tokens + n, callback_data.response.end() - n); - printf("%zu", n); - } - fflush(stdout); -#if 1 // DEBUG: print current beams for this iteration - std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n"; - for (size_t i = 0 ; i < beams_state.n_beams ; ++i) { - std::cout << "beams["< 3 ) - { - params.prompt = argv[3]; - } - - if ( params.prompt.empty() ) - { - params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n"; - } - - //--------------------------------- - // Init LLM : - //--------------------------------- - - llama_backend_init(); - llama_numa_init(params.numa); - - llama_model * model; - llama_context * ctx; - - std::tie(model, ctx) = llama_init_from_gpt_params( params ); - - if ( model == NULL ) - { - fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); - return 1; - } - - //--------------------------------- - // Tokenize the prompt : - //--------------------------------- - - std::vector tokens_list = llama_tokenize(ctx, params.prompt, true); - - const size_t max_context_size = llama_n_ctx( ctx ); - const size_t max_tokens_list_size = max_context_size - 4 ; - - if (tokens_list.size() > max_tokens_list_size) - { - fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" , - __func__ , tokens_list.size() , max_tokens_list_size ); - return 1; - } - - fprintf( stderr, "\n\n" 
); - - // Print the tokens from the prompt : - - for( auto id : tokens_list ) - { - std::cout << llama_token_to_piece(ctx, id); - } - std::cout << std::flush; - - int n_past = 0; - - if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0))) - { - fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ ); - return 1; - } - n_past += tokens_list.size(); - - beam_search_callback_data callback_data{ctx, {}}; - size_t const beam_width = static_cast(params.n_beams); - int const n_predict = 256; - llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict); - - std::cout << "\n\n"; - for (llama_token const token_id : callback_data.response) { - std::cout << llama_token_to_piece(ctx,token_id); - } - std::cout << std::endl; - - llama_free( ctx ); - llama_free_model( model ); - - llama_backend_free(); - - return 0; -} diff --git a/llama.cpp b/llama.cpp index a3e944874..92c33f53e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -14711,260 +14711,6 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar ctx->t_sample_us += ggml_time_us() - t_start_sample_us; } -// -// Beam search -// - -struct llama_beam { - std::vector tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Initialize end-of-beam to false. Callback sets this to true. - // Sort beams by probability. In case of ties, prefer beams at eob. - bool operator<(const llama_beam & rhs) const { - return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob); - } - // Shift off first n tokens and discard them. - void shift_tokens(const size_t n) { - if (n) { - std::copy(tokens.begin() + n, tokens.end(), tokens.begin()); - tokens.resize(tokens.size() - n); - } - } - llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; } -}; - -// A struct for calculating logit-related info. -struct llama_logit_info { - const float * const logits; - const int n_vocab; - const float max_l; - const float normalizer; - struct sum_exp { - float max_l; - float operator()(float sum, float l) const { return sum + std::exp(l - max_l); } - }; - llama_logit_info(llama_context * ctx) - : logits(llama_get_logits(ctx)) - , n_vocab(llama_n_vocab(llama_get_model(ctx))) - , max_l(*std::max_element(logits, logits + n_vocab)) - , normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l})) - { } - llama_token_data get_token_data(const llama_token token_id) const { - constexpr auto p = std::numeric_limits::quiet_NaN(); // never used - return {token_id, logits[token_id], p}; - } - // Return top k token_data by logit. 
- std::vector top_k(size_t k) { - std::vector min_heap; // min-heap by logit - const llama_token k_min = std::min(static_cast(k), n_vocab); - min_heap.reserve(k_min); - for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) { - min_heap.push_back(get_token_data(token_id)); - } - auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; }; - std::make_heap(min_heap.begin(), min_heap.end(), comp); - for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) { - if (min_heap.front().logit < logits[token_id]) { - std::pop_heap(min_heap.begin(), min_heap.end(), comp); - min_heap.back().id = token_id; - min_heap.back().logit = logits[token_id]; - std::push_heap(min_heap.begin(), min_heap.end(), comp); - } - } - return min_heap; - } - float probability_from_logit(float logit) const { - return normalizer * std::exp(logit - max_l); - } -}; - -struct llama_beam_search_data { - llama_context * ctx; - size_t n_beams; - int n_past; - int n_predict; - std::vector beams; - std::vector next_beams; - - // Re-calculated on each loop iteration - size_t common_prefix_length; - - // Used to communicate to/from callback on beams state. - std::vector beam_views; - - llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict) - : ctx(ctx) - , n_beams(n_beams) - , n_past(n_past) - , n_predict(n_predict) - , beam_views(n_beams) { - beams.reserve(n_beams); - next_beams.reserve(n_beams); - } - - // Collapse beams to a single beam given by index. - void collapse_beams(const size_t beam_idx) { - if (0u < beam_idx) { - std::swap(beams[0], beams[beam_idx]); - } - beams.resize(1); - } - - // Min-heaps are used to efficiently collect the top-k elements (k=n_beams). - // The repetitive patterns below reflect the 2 stages of heaps: - // * Gather elements until the vector is full, then call std::make_heap() on it. - // * If the heap is full and a new element is found that should be included, pop the - // least element to the back(), replace it with the new, then push it into the heap. - void fill_next_beams_by_top_probabilities(llama_beam & beam) { - // Min-heaps use a greater-than comparator. - const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; }; - if (beam.eob) { - // beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough. - if (next_beams.size() < n_beams) { - next_beams.push_back(std::move(beam)); - if (next_beams.size() == n_beams) { - std::make_heap(next_beams.begin(), next_beams.end(), comp); - } - } else if (next_beams.front().p < beam.p) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = std::move(beam); - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } else { - // beam is not at end-of-sentence, so branch with next top_k tokens. - if (!beam.tokens.empty()) { - llama_decode(ctx, llama_batch_get_one(beam.tokens.data(), beam.tokens.size(), n_past, 0)); - } - llama_logit_info logit_info(ctx); - std::vector next_tokens = logit_info.top_k(n_beams); - - // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode() - // call in loop() will conclusively fill in the kv slot once the beams converge at this position. 
- llama_kv_cache_seq_rm(ctx, 0, n_past, -1); - - size_t i=0; - if (next_beams.size() < n_beams) { - for (; next_beams.size() < n_beams ; ++i) { - llama_beam next_beam = beam; - next_beam.tokens.push_back(next_tokens[i].id); - next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit); - next_beams.push_back(std::move(next_beam)); - } - std::make_heap(next_beams.begin(), next_beams.end(), comp); - } else { - for (; next_beams.front().p == 0.0f ; ++i) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = beam; - next_beams.back().tokens.push_back(next_tokens[i].id); - next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit); - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } - for (; i < n_beams ; ++i) { - const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit); - if (next_beams.front().p < next_p) { - std::pop_heap(next_beams.begin(), next_beams.end(), comp); - next_beams.back() = beam; - next_beams.back().tokens.push_back(next_tokens[i].id); - next_beams.back().p = next_p; - std::push_heap(next_beams.begin(), next_beams.end(), comp); - } - } - } - } - - // Find common_prefix_length based on beams. - // Requires beams is not empty. - size_t find_common_prefix_length() { - size_t common_prefix_length = beams[0].tokens.size(); - for (size_t i = 1 ; i < beams.size() ; ++i) { - common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size()); - for (size_t j = 0 ; j < common_prefix_length ; ++j) { - if (beams[0].tokens[j] != beams[i].tokens[j]) { - common_prefix_length = j; - break; - } - } - } - return common_prefix_length; - } - - // Construct beams_state to send back to caller via the callback function. - // Side effect: set common_prefix_length = find_common_prefix_length(); - llama_beams_state get_beams_state(const bool last_call) { - for (size_t i = 0 ; i < beams.size() ; ++i) { - beam_views[i] = beams[i].view(); - } - common_prefix_length = find_common_prefix_length(); - return {beam_views.data(), beams.size(), common_prefix_length, last_call}; - } - - // Loop: - // * while i < n_predict, AND - // * any of the beams have not yet reached end-of-beam (eob), AND - // * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence - // (since all other beam probabilities can only decrease) - void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) { - beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob. - const auto not_eob = [](const llama_beam & beam) { return !beam.eob; }; - for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) && - !beams[top_beam_index()].eob ; ++i) { - callback(callback_data, get_beams_state(false)); // Sets common_prefix_length - update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed. - if (common_prefix_length) { - llama_decode(ctx, llama_batch_get_one(beams[0].tokens.data(), common_prefix_length, n_past, 0)); - n_past += common_prefix_length; - } - // Zero-out next_beam probabilities to place them last in following min-heap. - std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; }); - for (llama_beam & beam : beams) { - beam.shift_tokens(common_prefix_length); - fill_next_beams_by_top_probabilities(beam); - } - // next_beams become the beams of next/final iteration. Swap them to re-use memory. 
- beams.swap(next_beams); - renormalize_beam_probabilities(beams); - } - collapse_beams(top_beam_index()); - callback(callback_data, get_beams_state(true)); - } - - // As beams grow, the cumulative probabilities decrease. - // Renormalize them to avoid floating point underflow. - static void renormalize_beam_probabilities(std::vector & beams) { - const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; }; - const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p); - std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; }); - } - - // Assumes beams is non-empty. Uses llama_beam::operator<() for ordering. - size_t top_beam_index() { - return std::max_element(beams.begin(), beams.end()) - beams.begin(); - } - - // Copy (p,eob) for each beam which may have been changed by the callback. - void update_beams_from_beam_views() { - for (size_t i = 0 ; i < beams.size() ; ++i) { - beams[i].p = beam_views[i].p; - beams[i].eob = beam_views[i].eob; - } - } -}; - -void llama_beam_search(llama_context * ctx, - llama_beam_search_callback_fn_t callback, void * callback_data, - size_t n_beams, int n_past, int n_predict) { - assert(ctx); - const int64_t t_start_sample_us = ggml_time_us(); - - llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict); - - beam_search_data.loop(callback, callback_data); - - ctx->t_sample_us += ggml_time_us() - t_start_sample_us; - ctx->n_sample++; -} - // // quantization // diff --git a/llama.h b/llama.h index a78ccdaf5..b2a302dad 100644 --- a/llama.h +++ b/llama.h @@ -1056,49 +1056,9 @@ extern "C" { llama_token token); // - // Beam search + // Model split // - struct llama_beam_view { - const llama_token * tokens; - - size_t n_tokens; - float p; // Cumulative beam probability (renormalized relative to all beams) - bool eob; // Callback should set this to true when a beam is at end-of-beam. - }; - - // Passed to beam_search_callback function. - // Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams - // (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks. - // These pointers are valid only during the synchronous callback, so should not be saved. - struct llama_beams_state { - struct llama_beam_view * beam_views; - - size_t n_beams; // Number of elements in beam_views[]. - size_t common_prefix_length; // Current max length of prefix tokens shared by all beams. - bool last_call; // True iff this is the last callback invocation. - }; - - // Type of pointer to the beam_search_callback function. - // void* callback_data is any custom data passed to llama_beam_search, that is subsequently - // passed back to beam_search_callback. This avoids having to use global variables in the callback. - typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state); - - /// @details Deterministically returns entire sentence constructed by a beam search. - /// @param ctx Pointer to the llama_context. - /// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state. - /// @param callback_data A pointer that is simply passed back to callback. - /// @param n_beams Number of beams to use. - /// @param n_past Number of tokens already evaluated. - /// @param n_predict Maximum number of tokens to predict. EOS may occur earlier. 
- LLAMA_API void llama_beam_search( - struct llama_context * ctx, - llama_beam_search_callback_fn_t callback, - void * callback_data, - size_t n_beams, - int32_t n_past, - int32_t n_predict); - /// @details Build a split GGUF final path for this chunk. /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" // Returns the split_path length. From 554c247caffed64465f372661f2826640cb10430 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 21:23:20 +0300 Subject: [PATCH 06/39] ggml : remove OpenCL (#7735) ggml-ci --- .github/workflows/build.yml | 36 +- CMakeLists.txt | 23 +- Makefile | 17 - README-sycl.md | 2 +- README.md | 116 +- common/common.cpp | 1 - examples/llama-bench/README.md | 9 +- examples/llama-bench/llama-bench.cpp | 11 +- examples/main-cmake-pkg/README.md | 8 +- flake.nix | 1 - ggml-metal.h | 2 +- ggml-opencl.cpp | 2305 -------------------------- ggml-opencl.h | 36 - ggml.c | 62 +- ggml.h | 1 - llama.cpp | 10 +- scripts/LlamaConfig.cmake.in | 5 - scripts/compare-llama-bench.py | 6 +- scripts/server-llm.sh | 11 +- scripts/sync-ggml-am.sh | 4 - scripts/sync-ggml.sh | 2 - 21 files changed, 29 insertions(+), 2639 deletions(-) delete mode 100644 ggml-opencl.cpp delete mode 100644 ggml-opencl.h diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e824136a5..93669d531 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -688,8 +688,6 @@ jobs: env: OPENBLAS_VERSION: 0.3.23 - OPENCL_VERSION: 2023.04.17 - CLBLAST_VERSION: 1.6.0 SDE_VERSION: 9.33.0-2024-01-07 VULKAN_VERSION: 1.3.261.1 @@ -706,8 +704,6 @@ jobs: defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' - build: 'avx512-x64' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' - - build: 'clblast-x64' - defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' - build: 'openblas-x64' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' - build: 'kompute-x64' @@ -732,27 +728,6 @@ jobs: run: | git submodule update --init kompute - - name: Download OpenCL SDK - id: get_opencl - if: ${{ matrix.build == 'clblast-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip" - mkdir $env:RUNNER_TEMP/opencl - tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl - - - name: Download CLBlast - id: get_clblast - if: ${{ matrix.build == 'clblast-x64' }} - run: | - curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" - curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" - 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z - rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast - foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) { - $txt = Get-Content -Path $f -Raw - $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', 
"$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8 - } - - name: Download OpenBLAS id: get_openblas if: ${{ matrix.build == 'openblas-x64' }} @@ -786,13 +761,6 @@ jobs: cmake -S . -B build ${{ matrix.defines }} cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} - - name: Add clblast.dll - id: add_clblast_dll - if: ${{ matrix.build == 'clblast-x64' }} - run: | - cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release - cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt - - name: Add libopenblas.dll id: add_libopenblas_dll if: ${{ matrix.build == 'openblas-x64' }} @@ -816,7 +784,7 @@ jobs: - name: Test id: cmake_test # not all machines have native AVX-512 - if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} + if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} run: | cd build ctest -L main -C Release --verbose --timeout 900 @@ -1071,7 +1039,7 @@ jobs: # hypervisor: 'qemu' # run: | # sudo pkg update -# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas +# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas # gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu` release: diff --git a/CMakeLists.txt b/CMakeLists.txt index 76ea27412..cf37d5bb2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,6 @@ option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for Flas option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) -option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_VULKAN "llama: use Vulkan" OFF) option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF) option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF) @@ -502,22 +501,6 @@ if (LLAMA_RPC) set(GGML_SOURCES_RPC ggml-rpc.cpp) endif() -if (LLAMA_CLBLAST) - find_package(CLBlast) - if (CLBlast_FOUND) - message(STATUS "CLBlast found") - - set(GGML_HEADERS_OPENCL ggml-opencl.h) - set(GGML_SOURCES_OPENCL ggml-opencl.cpp) - - add_compile_definitions(GGML_USE_CLBLAST) - - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast) - else() - message(WARNING "CLBlast not found") - endif() -endif() - if (LLAMA_VULKAN) find_package(Vulkan) if (Vulkan_FOUND) @@ -1265,7 +1248,6 @@ add_library(ggml OBJECT ggml-quants.c ggml-quants.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} @@ -1353,8 +1335,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama) set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" - "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" - "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}") + "${GGML_HEADERS_CUDA}" + "${GGML_HEADERS_METAL}" + "${GGML_HEADERS_EXTRA}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") install(TARGETS ggml PUBLIC_HEADER) diff --git a/Makefile b/Makefile index 
27eb69871..802ee6a47 100644 --- a/Makefile +++ b/Makefile @@ -547,23 +547,6 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h $(NVCC_COMPILE) endif # LLAMA_CUDA -ifdef LLAMA_CLBLAST - MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL) - MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL) - MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL) - - # Mac provides OpenCL as a framework - ifeq ($(UNAME_S),Darwin) - MK_LDFLAGS += -lclblast -framework OpenCL - else - MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL) - endif - OBJS += ggml-opencl.o - -ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h - $(CXX) $(CXXFLAGS) -c $< -o $@ -endif # LLAMA_CLBLAST - ifdef LLAMA_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN MK_LDFLAGS += -lvulkan diff --git a/README-sycl.md b/README-sycl.md index 37f0306dc..62b38135c 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend. -It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose. +It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose. ## News diff --git a/README.md b/README.md index 1f2d9b1f2..9d2a59d89 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ variety of hardware - locally and in the cloud. - AVX, AVX2 and AVX512 support for x86 architectures - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use - Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP) -- Vulkan, SYCL, and (partial) OpenCL backend support +- Vulkan and SYCL backend support - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has @@ -371,16 +371,11 @@ In order to build llama.cpp you have four different options. 3. Install compilation dependencies. ```bash - sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \ - opencl clblast openblas + sudo pkg install gmake automake autoconf pkgconf llvm15 openblas gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 ``` - **Notes:** With this packages you can build llama.cpp with OPENBLAS and - CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read - the instructions for use and activate this options in this document below. - ### Homebrew On Mac and Linux, the homebrew package manager can be used via @@ -399,7 +394,7 @@ argument. ### BLAS Build -Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). 
Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: +Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use: - #### Accelerate Framework: @@ -553,111 +548,6 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | -- #### CLBlast - - OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. - - You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK). - - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed. - - - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page. - - -
- Installing the OpenCL SDK from source - - ```sh - git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git - cd OpenCL-SDK - cmake -B build -DBUILD_DOCS=OFF \ - -DBUILD_EXAMPLES=OFF \ - -DBUILD_TESTING=OFF \ - -DOPENCL_SDK_BUILD_SAMPLES=OFF \ - -DOPENCL_SDK_TEST_SAMPLES=OFF - cmake --build build - cmake --install build --prefix /some/path - ``` -
- - ##### Installing CLBlast - - Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages. - - Linux packaging: - Fedora Linux: - ```bash - sudo dnf install clblast - ``` - - Alternatively, they may be built from source. - - -
- Windows: - - ```cmd - set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64" - git clone https://github.com/CNugteren/CLBlast.git - cd CLBlast - cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64 - cmake --build build --config Release - cmake --install build --prefix C:/CLBlast - ``` - - (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds) - - -
- Unix: - - ```sh - git clone https://github.com/CNugteren/CLBlast.git - cd CLBlast - cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF - cmake --build build --config Release - cmake --install build --prefix /some/path - ``` - - Where `/some/path` is where the built library will be installed (default is `/usr/local`). -
- - ##### Building Llama with CLBlast - - - Build with make: - ```sh - make LLAMA_CLBLAST=1 - ``` - - CMake (Unix): - ```sh - cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path - cmake --build build --config Release - ``` - - CMake (Windows): - ```cmd - set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast" - git clone https://github.com/ggerganov/llama.cpp - cd llama.cpp - cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64 - cmake --build build --config Release - cmake --install build --prefix C:/LlamaCPP - ``` - - ##### Running Llama with CLBlast - - The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does. - - To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`. - The selection can be a number (starting from 0) or a text string to search: - - ```sh - GGML_OPENCL_PLATFORM=1 ./main ... - GGML_OPENCL_DEVICE=2 ./main ... - GGML_OPENCL_PLATFORM=Intel ./main ... - GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ... - ``` - - The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful. - Using the variables it is possible to select a CPU-based driver as well, if so desired. - - You can get a list of platforms and devices from the `clinfo -l` command, etc. - - #### Vulkan **With docker**: diff --git a/common/common.cpp b/common/common.cpp index 022bfe287..df583db83 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -2844,7 +2844,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); - fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false"); fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? 
"true" : "false"); diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 857840564..fd95b35f4 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -162,7 +162,7 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" "3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" ``` @@ -179,7 +179,6 @@ $ ./llama-bench -o json "build_commit": "3469684", "build_number": 1275, "cuda": true, - "opencl": false, "metal": false, "gpu_blas": true, "blas": true, @@ -210,7 +209,6 @@ $ ./llama-bench -o json "build_commit": "3469684", "build_number": 1275, "cuda": true, - "opencl": false, "metal": false, "gpu_blas": true, "blas": true, @@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, cuda INTEGER, - opencl INTEGER, metal INTEGER, gpu_blas INTEGER, blas INTEGER, @@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES 
('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); +INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 5d3cbd842..fa7ad1bdb 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -723,7 +723,6 @@ struct test { static const std::string build_commit; static const int build_number; static const bool cuda; - static const bool opencl; static const bool vulkan; static const bool kompute; static const bool metal; @@ -812,9 +811,6 @@ struct test { if (cuda) { return GGML_CUDA_NAME; } - if (opencl) { - return "OpenCL"; - } if (vulkan) { return "Vulkan"; } @@ -843,7 +839,7 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { "build_commit", "build_number", - "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", + "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", @@ -869,7 +865,7 @@ struct test { field == "avg_ns" || field == "stddev_ns") { return INT; } - if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || + if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; @@ -898,7 +894,7 @@ struct test { } std::vector values = { build_commit, std::to_string(build_number), - std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan), + std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan), std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), @@ -927,7 +923,6 @@ struct test { const std::string test::build_commit = LLAMA_COMMIT; const int test::build_number = LLAMA_BUILD_NUMBER; const bool test::cuda = !!ggml_cpu_has_cuda(); -const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::kompute = !!ggml_cpu_has_kompute(); const bool test::metal = !!ggml_cpu_has_metal(); diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md index edf20d8db..a88e92f23 100644 --- a/examples/main-cmake-pkg/README.md +++ b/examples/main-cmake-pkg/README.md @@ -8,16 +8,14 @@ 
Because this example is "outside of the source tree", it is important to first b ### Considerations -When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. +When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package. ### Build llama.cpp and install to C:\LlamaCPP directory -In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`. - ```cmd git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 +cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64 cmake --build build --config Release cmake --install build --prefix C:/LlamaCPP ``` @@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP ```cmd cd ..\examples\main-cmake-pkg -cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 +cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 cmake --build build --config Release cmake --install build --prefix C:/MyLlamaApp ``` diff --git a/flake.nix b/flake.nix index 9cd3756e5..0a52ea52e 100644 --- a/flake.nix +++ b/flake.nix @@ -159,7 +159,6 @@ windows = config.legacyPackages.llamaPackagesWindows.llama-cpp; } // lib.optionalAttrs pkgs.stdenv.isLinux { - opencl = config.packages.default.override { useOpenCL = true; }; cuda = config.legacyPackages.llamaPackagesCuda.llama-cpp; mpi-cpu = config.packages.default.override { useMpi = true; }; diff --git a/ggml-metal.h b/ggml-metal.h index a5c542189..e7543ae79 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -1,7 +1,7 @@ // An interface allowing to compute ggml_cgraph with Metal // // This is a fully functional interface that extends ggml with GPU support for Apple devices. -// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.) +// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.) // // How it works? // diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp deleted file mode 100644 index e28566a7b..000000000 --- a/ggml-opencl.cpp +++ /dev/null @@ -1,2305 +0,0 @@ -#include "ggml.h" -#include "ggml-opencl.h" -#include "ggml-backend-impl.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#define CL_TARGET_OPENCL_VERSION 120 -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#define CL_DMMV_LOCAL_SIZE 32 - -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 1 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - -#define MULTILINE_QUOTE(...) 
#__VA_ARGS__ -static std::string program_source = MULTILINE_QUOTE( - -typedef char int8_t; -typedef uchar uint8_t; -typedef short int16_t; -typedef ushort uint16_t; -typedef int int32_t; -typedef uint uint32_t; - -struct __attribute__ ((packed)) block_q4_0 -{ - half d; - uint8_t qs[QK4_0 / 2]; -}; - -struct __attribute__ ((packed)) block_q4_1 -{ - half d; - half m; - uint8_t qs[QK4_1 / 2]; -}; - -struct __attribute__ ((packed)) block_q5_0 -{ - half d; - uint32_t qh; - uint8_t qs[QK5_0 / 2]; -}; - -struct __attribute__ ((packed)) block_q5_1 -{ - half d; - half m; - uint32_t qh; - uint8_t qs[QK5_1 / 2]; -}; - -struct __attribute__ ((packed)) block_q8_0 -{ - half d; - int8_t qs[QK8_0]; -}; - -struct __attribute__((packed)) block_q2_K -{ - uint8_t scales[16]; - uint8_t qs[64]; - half d; - half dmin; -}; - -struct __attribute__((packed)) block_q3_K -{ - uint8_t hmask[32]; - uint8_t qs[64]; - uint8_t scales[12]; - half d; -}; - -struct __attribute__((packed)) block_q4_K -{ - half d; - half dmin; - uint8_t scales[12]; - uint8_t qs[128]; -}; - -struct __attribute__((packed)) block_q5_K -{ - half d; - half dmin; - uint8_t scales[12]; - uint8_t qh[32]; - uint8_t qs[128]; -}; - -struct __attribute__((packed)) block_q6_K -{ - uint8_t ql[128]; - uint8_t qh[64]; - int8_t scales[16]; - half d; -}; - -__kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { - const uint i = get_global_id(0); - - y[i] = vload_half(0, &x[i]); -} - -void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = (vi0 - 8)*d; - *v1 = (vi1 - 8)*d; -} -void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - const float m = vload_half(0, &x[ib].m); - - const uint8_t vui = x[ib].qs[iqs]; - - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; - - *v0 = vi0*d + m; - *v1 = vi1*d + m; -} -void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; - - *v0 = x0*d; - *v1 = x1*d; -} -void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - const float m = vload_half(0, &x[ib].m); - - uint32_t qh = x[ib].qh; - - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); - - *v0 = x0*d + m; - *v1 = x1*d + m; -} -void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { - const float d = vload_half(0, &x[ib].d); - - const int8_t vi0 = x[ib].qs[iqs + 0]; - const int8_t vi1 = x[ib].qs[iqs + 1]; - - *v0 = vi0*d; - *v1 = vi1*d; -} -void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ - *v0 = vload_half(0, &x[ib + 0]); - *v1 = vload_half(0, &x[ib + 1]); -} -); - -static std::string k_quants_source = MULTILINE_QUOTE( -inline void 
get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) -{ - if (j < 4) - { - *d = q[j] & 63; - *m = q[j + 4] & 63; - } - else - { - *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); - *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); - } -} - -__kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int n = tid / 32; - const int l = tid - 32 * n; - const int is = 8 * n + l / 16; - - const uint8_t q = x[i].qs[32 * n + l]; - __global float *y = yy + get_group_id(0) * QK_K + 128 * n; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - y[l + 0] = dall * (x[i].scales[is + 0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is + 0] >> 4); - y[l + 32] = dall * (x[i].scales[is + 2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is + 2] >> 4); - y[l + 64] = dall * (x[i].scales[is + 4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is + 4] >> 4); - y[l + 96] = dall * (x[i].scales[is + 6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is + 6] >> 4); -} - -__kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) -{ - int r = get_local_id(0) / 4; - int i = get_group_id(0) + get_global_offset(0); - int tid = r / 2; - int is0 = r % 2; - int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); - int n = tid / 4; - int j = tid - 4 * n; - - uint8_t m = 1 << (4 * n + j); - int is = 8 * n + 2 * j + is0; - int shift = 2 * j; - - int8_t us = is < 4 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 8] >> 0) & 3) << 4) - : is < 8 ? (x[i].scales[is - 0] & 0xF) | (((x[i].scales[is + 4] >> 2) & 3) << 4) - : is < 12 ? (x[i].scales[is - 8] >> 4) | (((x[i].scales[is + 0] >> 4) & 3) << 4) - : (x[i].scales[is - 8] >> 4) | (((x[i].scales[is - 4] >> 6) & 3) << 4); - float d_all = vload_half(0, &x[i].d); - float dl = d_all * (us - 32); - - __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j; - const __global uint8_t *q = x[i].qs + 32 * n; - const __global uint8_t *hm = x[i].hmask; - - for (int l = l0; l < l0 + 4; ++l) - y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 
0 : 4)); -} - -__kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int il = tid / 8; - const int ir = tid % 8; - const int is = 2 * il; - const int n = 4; - - __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint8_t *q = x[i].qs + 32 * il + n * ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - float d1 = dall * sc; - float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - float d2 = dall * sc; - float m2 = dmin * m; - for (int l = 0; l < n; ++l) - { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l + 32] = d2 * (q[l] >> 4) - m2; - } -} - -__kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int il = tid / 16; - const int ir = tid % 16; - const int is = 2 * il; - - __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint8_t *ql = x[i].qs + 32 * il + 2 * ir; - __global const uint8_t *qh = x[i].qh + 2 * ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, &sc, &m); - const float d1 = dall * sc; - const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, &sc, &m); - const float d2 = dall * sc; - const float m2 = dmin * m; - - uint8_t hm = 1 << (2 * il); - y[0] = d1 * ((ql[0] & 0xF) + (qh[0] & hm ? 16 : 0)) - m1; - y[1] = d1 * ((ql[1] & 0xF) + (qh[1] & hm ? 16 : 0)) - m1; - hm <<= 1; - y[32] = d2 * ((ql[0] >> 4) + (qh[0] & hm ? 16 : 0)) - m2; - y[33] = d2 * ((ql[1] >> 4) + (qh[1] & hm ? 16 : 0)) - m2; -} - -__kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) -{ - const int i = get_group_id(0) + get_global_offset(0); - const int tid = get_local_id(0); - const int ip = tid / 32; - const int il = tid - 32 * ip; - const int is = 8 * ip + il / 16; - - __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il; - - const float d = vload_half(0, &x[i].d); - - __global const uint8_t *ql = x[i].ql + 64 * ip + il; - const uint8_t qh = x[i].qh[32 * ip + il]; - __global const int8_t *sc = x[i].scales + is; - - y[0] = d * sc[0] * ((int8_t)((ql[0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = d * sc[4] * ((int8_t)((ql[0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -} - -__kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q2_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 16/K_QUANTS_PER_ITERATION; - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
- const int in = tid - step*im; // 0...15 or 0...7 - - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; - - tmp[16 * ix + tid] = 0; - - uint32_t aux[4]; - const uint8_t * d = (const uint8_t *)aux; - const uint8_t * m = (const uint8_t *)(aux + 2); - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * q = x[i].qs + q_offset; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - - } - tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q3_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
- const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); - - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; - - uint16_t utmp[4]; - const int8_t * s = (const int8_t *)utmp; - - const uint16_t s_shift = 4*im; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * q = x[i].qs + q_offset; - __global const uint8_t * h = x[i].hmask + l0; - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - - const float d = vload_half(0, &x[i].d); - - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); - } - tmp[16 * ix + tid] += d * sum; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - //to rename it later, just to test now - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int row = get_group_id(0); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; - - const int step = 8/K_QUANTS_PER_ITERATION; - - const int il = tid/step; // 0...3 - const int ir = tid - step*il;// 0...3 - const int n = 2*K_QUANTS_PER_ITERATION; - - const int im = il/2; // 0 or 1. 
0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - __global const struct block_q4_K * x = xx + ib0; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const uint8_t * q1 = x[i].qs + q_offset; - __global const uint8_t * q2 = q1 + 64; - __global const float * y1 = yy + i*QK_K + y_offset; - __global const float * y2 = y1 + 128; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 s = (float4)(0.f); - float smin = 0; - for (int l = 0; l < n; ++l) { - s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); - s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { - - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int row = get_group_id(0); - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - const int tid = get_local_id(0)/2; // 0...15 - const int ix = get_local_id(0)%2; - - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 2; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - __global const struct block_q5_K * x = xx + ib0; - - tmp[16 * ix + tid] = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2) { - - __global const uint8_t * ql1 = x[i].qs + q_offset; - __global const uint8_t * ql2 = ql1 + 64; - __global const uint8_t * qh = x[i].qh + l0; - __global const float * y1 = yy + i*QK_K + y_offset; - __global const float * y2 = y1 + 128; - - const float dall = vload_half(0, &x[i].d); - const float dmin = vload_half(0, &x[i].dmin); - - __global const uint16_t * a = (__global const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 sum = (float4)(0.f); - float smin = 0; - for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) - + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 
16 : 0)); - sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) - + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) - + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) - + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; - } - tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -__kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { - - const int row = get_group_id(0); - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row + get_global_offset(0); - - __global const struct block_q6_K * x = xx + ib0; - - const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0, 1 - - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - -\n#if K_QUANTS_PER_ITERATION == 1\n - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; - -\n#else\n - - const int l0 = 4 * in; // 0, 4, 8, ..., 28 - const int is = in / 4; - -\n#endif\n - - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; - - tmp[16 * ix + tid] = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - __global const float * y = yy + i * QK_K + y_offset; - __global const uint8_t * ql = x[i].ql + ql_offset; - __global const uint8_t * qh = x[i].qh + qh_offset; - __global const int8_t * s = x[i].scales + s_offset; - - const float d = vload_half(0, &x[i].d); - -\n#if K_QUANTS_PER_ITERATION == 1\n - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp[16 * ix + tid] += sum; -\n#else\n - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp[16 * 
ix + tid] += sum; -\n#endif\n - - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=16; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} -); - - -std::string dequant_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; - - if (i >= get_global_size(0)) { - return; - } - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int ib = i/qk + get_global_offset(0); // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - y[iybs + iqs + 0] = v0; - y[iybs + iqs + y_offset] = v1; -} -); - -std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { - const int local_size = get_local_size(0); - const int row = get_group_id(0); - const int tid = get_local_id(0); - - const uint qk = QUANT_K; - const uint qr = QUANT_R; - - const int col_step = local_size * 2; - const int y_offset = qr == 1 ? 1 : qk/2; - - x += get_global_offset(0); - - tmp[tid] = 0; - - for (int col = tid*2; col < ncols; col += col_step) { - const int ib = (row*ncols + col)/qk; // block index - const int iqs = (col%qk)/qr; // quant index - const int iybs = col - col%qk; // y block start index - - // dequantize - float v0, v1; - DEQUANT_FUNC(x, ib, iqs, &v0, &v1); - - // matrix multiplication - tmp[tid] += v0 * y[iybs + iqs + 0]; - tmp[tid] += v1 * y[iybs + iqs + y_offset]; - } - - // sum up partial sums and write back result - barrier(CLK_LOCAL_MEM_FENCE); - for (int s=local_size/2; s>0; s>>=1) { - if (tid < s) { - tmp[tid] += tmp[tid + s]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if (tid == 0) { - dst[row] = tmp[0]; - } -} - -); - - -std::string mul_template = MULTILINE_QUOTE( -__kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); - - if (i >= get_global_size(0)) { - return; - } - - dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; -} -); - -std::string add_template = MULTILINE_QUOTE( -__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) { - const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); - - if (i >= get_global_size(0)) { - return; - } - - dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky]; -} -); - -#define CL_CHECK(err) \ - do { \ - cl_int err_ = (err); \ - if (err_ != CL_SUCCESS) { \ - fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ - #err, err_, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) - -#define CLBLAST_CHECK(err) \ - do { \ - CLBlastStatusCode err_ = (err); \ - if (err_ != CLBlastSuccess) { \ - fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ - #err, err_, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) - -std::array dequant_str_keys = { - "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC" -}; - -std::array dequant_str_values = { - "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", 
- "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", - "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", - "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", - "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", - "convert_row_f16", "half", "1", "1", "convert_f16" -}; - -std::array dequant_mul_mat_vec_str_values = { - "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0", - "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1", - "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0", - "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1", - "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0", - "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16" -}; - -std::array mul_str_keys = { - "KERNEL_NAME", "TYPE" -}; -std::array mul_str_values = { - "mul_f32", "float" -}; - -static std::string& replace(std::string& s, const std::string& from, const std::string& to) { - size_t pos = 0; - while ((pos = s.find(from, pos)) != std::string::npos) { - s.replace(pos, from.length(), to); - pos += to.length(); - } - return s; -} - -static std::string generate_kernels() { - std::stringstream src; - src << program_source << '\n'; - src << k_quants_source << '\n'; - for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) { - std::string dequant_kernel = dequant_template; - std::string dmmv_kernel = dequant_mul_mat_vec_template; - for (size_t j = 0; j < dequant_str_keys.size(); j++) { - replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]); - replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]); - } - src << dequant_kernel << '\n'; - src << dmmv_kernel << '\n'; - } - for (size_t i = 0; i < mul_str_values.size(); i += mul_str_keys.size()) { - std::string mul_kernel = mul_template; - for (size_t j = 0; j < mul_str_keys.size(); j++) { - replace(mul_kernel, mul_str_keys[j], mul_str_values[i + j]); - } - src << mul_kernel << '\n'; - } - src << add_template << '\n'; - - return src.str(); -} - -static cl_platform_id platform; -static cl_device_id device; -static cl_context context; -static cl_command_queue queue; -static cl_program program; -static cl_kernel convert_row_f16_cl; -static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; -static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; -static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; -static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; -static cl_kernel mul_f32_cl; -static cl_kernel add_f32_cl; -static bool fp16_support; - -static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { - cl_program p; - char *program_log; - size_t program_size; - size_t log_size; - int err; - - program_size = strlen(program_buffer); - - p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); - 
if(err < 0) { - fprintf(stderr, "OpenCL error creating program"); - exit(1); - } - - std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " - "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " - "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); - - err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); - if(err < 0) { - - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); - program_log = (char*) malloc(log_size + 1); - program_log[log_size] = '\0'; - clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); - fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log); - free(program_log); - exit(1); - } - - return p; -} - -void ggml_cl_init(void) { - static bool initialized = false; - if (initialized) { - return; - } - initialized = true; - - cl_int err; - - struct cl_device; - struct cl_platform { - cl_platform_id id; - unsigned number; - char name[128]; - char vendor[128]; - struct cl_device * devices; - unsigned n_devices; - struct cl_device * default_device; - }; - - struct cl_device { - struct cl_platform * platform; - cl_device_id id; - unsigned number; - cl_device_type type; - char name[128]; - }; - - enum { NPLAT = 16, NDEV = 16 }; - - struct cl_platform platforms[NPLAT]; - unsigned n_platforms = 0; - struct cl_device devices[NDEV]; - unsigned n_devices = 0; - struct cl_device * default_device = NULL; - - platform = NULL; - device = NULL; - - cl_platform_id platform_ids[NPLAT]; - CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); - - for (unsigned i = 0; i < n_platforms; i++) { - struct cl_platform * p = &platforms[i]; - p->number = i; - p->id = platform_ids[i]; - CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); - CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); - - cl_device_id device_ids[NDEV]; - cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); - if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { - p->n_devices = 0; - } else { - CL_CHECK(clGetDeviceIDsError); - } - p->devices = p->n_devices > 0 ? 
&devices[n_devices] : NULL; - p->default_device = NULL; - - for (unsigned j = 0; j < p->n_devices; j++) { - struct cl_device * d = &devices[n_devices]; - d->number = n_devices++; - d->id = device_ids[j]; - d->platform = p; - CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); - CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); - - if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { - p->default_device = d; - } - } - - if (default_device == NULL && p->default_device != NULL) { - default_device = p->default_device; - } - } - - if (n_devices == 0) { - fprintf(stderr, "ggml_opencl: could find any OpenCL devices.\n"); - exit(1); - } - - char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); - char * user_device_string = getenv("GGML_OPENCL_DEVICE"); - int user_platform_number = -1; - int user_device_number = -1; - - unsigned n; - if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { - user_platform_number = (int)n; - } - if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { - user_device_number = (int)n; - } - if (user_platform_number != -1 && user_device_number != -1) { - cl_platform* platform = &platforms[user_platform_number]; - if ((unsigned)user_device_number >= platform->n_devices) { - fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number); - exit(1); - } - default_device = &platform->devices[user_device_number]; - } else { - - struct cl_device * selected_devices = devices; - unsigned n_selected_devices = n_devices; - - if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { - for (unsigned i = 0; i < n_platforms; i++) { - struct cl_platform * p = &platforms[i]; - if (strstr(p->name, user_platform_string) != NULL || - strstr(p->vendor, user_platform_string) != NULL) { - user_platform_number = (int)i; - break; - } - } - if (user_platform_number == -1) { - fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); - exit(1); - } - } - if (user_platform_number != -1) { - struct cl_platform * p = &platforms[user_platform_number]; - selected_devices = p->devices; - n_selected_devices = p->n_devices; - default_device = p->default_device; - if (n_selected_devices == 0) { - fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); - exit(1); - } - } - - if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { - for (unsigned i = 0; i < n_selected_devices; i++) { - struct cl_device * d = &selected_devices[i]; - if (strstr(d->name, user_device_string) != NULL) { - user_device_number = d->number; - break; - } - } - if (user_device_number == -1) { - fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string); - exit(1); - } - } - if (user_device_number != -1) { - selected_devices = &devices[user_device_number]; - n_selected_devices = 1; - default_device = &selected_devices[0]; - } - - GGML_ASSERT(n_selected_devices > 0); - - if (default_device == NULL) { - default_device = &selected_devices[0]; - } - } - - fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); - fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name); - if (default_device->type != CL_DEVICE_TYPE_GPU) { - fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); - } - - platform = 
default_device->platform->id; - device = default_device->id; - - size_t ext_str_size; - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size); - char *ext_buffer = (char *)alloca(ext_str_size + 1); - clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL); - ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated - // Disabled due to faulty outputs - // Check if ext_buffer contains cl_khr_fp16 - fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL; - // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); - - cl_context_properties properties[] = { - (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 - }; - - CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); - - CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), - (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : - (queue = clCreateCommandQueue(context, device, 0, &err), err) - ))); - - const std::string kernel_src = generate_kernels(); - - program = build_program_from_source(context, device, kernel_src.c_str()); - - // FP16 to FP32 kernel - CL_CHECK((convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err), err)); - - // Dequantize kernels - CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); - CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); - CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); - CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); - CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); - CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); - CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); - CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); - CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); - CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); - CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); - - // dequant mul mat kernel - CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); - CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, 
"dequantize_mul_mat_vec_q5_K", &err), err)); - CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); - - // mul kernel - CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); - - CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err)); -} - -static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return &dequantize_row_q4_0_cl; - case GGML_TYPE_Q4_1: - return &dequantize_row_q4_1_cl; - case GGML_TYPE_Q5_0: - return &dequantize_row_q5_0_cl; - case GGML_TYPE_Q5_1: - return &dequantize_row_q5_1_cl; - case GGML_TYPE_Q8_0: - return &dequantize_row_q8_0_cl; - case GGML_TYPE_Q2_K: - return &dequantize_block_q2_k_cl; - case GGML_TYPE_Q3_K: - return &dequantize_block_q3_k_cl; - case GGML_TYPE_Q4_K: - return &dequantize_block_q4_k_cl; - case GGML_TYPE_Q5_K: - return &dequantize_block_q5_k_cl; - case GGML_TYPE_Q6_K: - return &dequantize_block_q6_k_cl; - case GGML_TYPE_F16: - return &convert_row_f16_cl; - default: - return nullptr; - } -} - -static size_t ggml_cl_global_denom(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 1; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - return 4; - case GGML_TYPE_Q4_K: - return 8; - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - return 4; - case GGML_TYPE_F16: - default: - return 1; - } -} - -static size_t ggml_cl_local_size(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 0; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - return 64; - case GGML_TYPE_Q4_K: - return 32; - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - return 64; - case GGML_TYPE_F16: - default: - return 0; - } -} - -static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return &dequantize_mul_mat_vec_q4_0_cl; - case GGML_TYPE_Q4_1: - return &dequantize_mul_mat_vec_q4_1_cl; - case GGML_TYPE_Q5_0: - return &dequantize_mul_mat_vec_q5_0_cl; - case GGML_TYPE_Q5_1: - return &dequantize_mul_mat_vec_q5_1_cl; - case GGML_TYPE_Q8_0: - return &dequantize_mul_mat_vec_q8_0_cl; - case GGML_TYPE_F16: - return &convert_mul_mat_vec_f16_cl; - case GGML_TYPE_Q2_K: - return &dequantize_mul_mat_vec_q2_K_cl; - case GGML_TYPE_Q3_K: - return &dequantize_mul_mat_vec_q3_K_cl; - case GGML_TYPE_Q4_K: - return &dequantize_mul_mat_vec_q4_K_cl; - case GGML_TYPE_Q5_K: - return &dequantize_mul_mat_vec_q5_K_cl; - case GGML_TYPE_Q6_K: - return &dequantize_mul_mat_vec_q6_K_cl; - default: - return nullptr; - } -} - -// buffer pool for cl -#define MAX_CL_BUFFERS 256 - -struct scoped_spin_lock { - std::atomic_flag& lock; - scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { - while (lock.test_and_set(std::memory_order_acquire)) { - ; // spin - } - } - ~scoped_spin_lock() { - lock.clear(std::memory_order_release); - } - scoped_spin_lock(const scoped_spin_lock&) = delete; - scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; -}; - -struct cl_buffer { - cl_mem mem; - size_t size = 0; -}; - -static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; -static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; - -static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) { - scoped_spin_lock lock(g_cl_pool_lock); - cl_int err; - - int best_i = -1; - size_t best_size = std::numeric_limits::max(); //smallest unused buffer 
that fits our needs - int worst_i = -1; - size_t worst_size = 0; //largest unused buffer seen so far - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer &b = g_cl_buffer_pool[i]; - if (b.size > 0 && b.size >= size && b.size < best_size) - { - best_i = i; - best_size = b.size; - } - if (b.size > 0 && b.size > worst_size) - { - worst_i = i; - worst_size = b.size; - } - } - if(best_i!=-1) //found the smallest buffer that fits our needs - { - cl_buffer& b = g_cl_buffer_pool[best_i]; - cl_mem mem = b.mem; - *actual_size = b.size; - b.size = 0; - return mem; - } - if(worst_i!=-1) //no buffer that fits our needs, resize largest one to save memory - { - cl_buffer& b = g_cl_buffer_pool[worst_i]; - cl_mem mem = b.mem; - b.size = 0; - clReleaseMemObject(mem); - } - cl_mem mem; - CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); - *actual_size = size; - return mem; -} - -static void ggml_cl_pool_free(cl_mem mem, size_t size) { - scoped_spin_lock lock(g_cl_pool_lock); - - for (int i = 0; i < MAX_CL_BUFFERS; ++i) { - cl_buffer& b = g_cl_buffer_pool[i]; - if (b.size == 0) { - b.mem = mem; - b.size = size; - return; - } - } - fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); - clReleaseMemObject(mem); -} - -void ggml_cl_free_data(const struct ggml_tensor* tensor) { - if (tensor->backend != GGML_BACKEND_TYPE_GPU) { - return; - } - - cl_mem mem = (cl_mem)tensor->extra; - clReleaseMemObject(mem); -} - -static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { - cl_int err; - const uint64_t ne0 = src->ne[0]; - const uint64_t ne1 = src->ne[1]; - const uint64_t nb0 = src->nb[0]; - const uint64_t nb1 = src->nb[1]; - const uint64_t nb2 = src->nb[2]; - const uint64_t nb3 = src->nb[3]; - const enum ggml_type type = src->type; - const size_t ts = ggml_type_size(type); - const size_t bs = ggml_blck_size(type); - const uint64_t row_size = ts*ne0/bs; - - const char * x = (const char *) src->data + i2*nb2 + i3*nb3; - if (nb0 == ts && nb1 == row_size) { - return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev); - } - if (nb0 == ts) { - const size_t buffer_origin[3] = { offset, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { row_size, ne1, 1 }; - return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev); - } - std::vector events; - if (ev && ne1>1) events.reserve(ne1-1); - for (uint64_t i1 = 0; i1 < ne1; i1++) { - // pretend the row is a matrix with cols=1 - const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 }; - const size_t host_origin[3] = { 0, 0, 0 }; - const size_t region[3] = { ts, ne0/bs, 1 }; - // if an event is requested, make the last write wait for all previous writes to complete - if (ev && i1) { - events.push_back(*ev); - } - cl_uint nevents = i1 == ne1-1 ? events.size() : 0U; - err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? 
events.data() : nullptr, ev); - if (err != CL_SUCCESS) { - for (auto event : events) { - clReleaseEvent(event); - } - return err; - } - } - for (auto event : events) { - CL_CHECK(clReleaseEvent(event)); - } - return CL_SUCCESS; -} - -static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - size_t x_size; - size_t d_size; - - cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 - cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. - cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst - - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev; - - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; - - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; - - size_t global = ne00 * ne01; - cl_int ky = ne10 * ne11; - - CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - - CL_CHECK(clReleaseEvent(ev)); - CL_CHECK(clFinish(queue)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); - } - } - ggml_cl_pool_free(d_X, x_size); - ggml_cl_pool_free(d_D, d_size); -} - -void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cl_mul_f32(src0, src1, dst); -} - -static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - size_t x_size; - size_t d_size; - - cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 - cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. 
- cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst - - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - cl_event ev; - - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); - - const int64_t i13 = i03%ne13; - const int64_t i12 = i02%ne12; - const int i1 = i13*ne12*ne11 + i12*ne11; - - cl_int x_offset = 0; - cl_int y_offset = i1*ne10; - cl_int d_offset = 0; - - size_t global = ne00 * ne01; - cl_int ky = ne10 * ne11; - - CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X)); - CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset)); - CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset)); - CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset)); - CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky)); - CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); - - CL_CHECK(clReleaseEvent(ev)); - CL_CHECK(clFinish(queue)); - - // copy dst to host - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); - } - } - ggml_cl_pool_free(d_X, x_size); - ggml_cl_pool_free(d_D, d_size); -} - -void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); - ggml_cl_add_f32(src0, src1, dst); -} - -static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size; - size_t y_size; - size_t d_size; - cl_mem d_X; - if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT - d_X = (cl_mem) src0->extra; - } else { - d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); - } - cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? 
(cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - - size_t x_offset = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_TYPE_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else { - // copy src0 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - } - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - // copy src1 to device - if (src1->backend == GGML_BACKEND_TYPE_CPU) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - } - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - - // copy dst to host - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); - } - } - } - } - } - - if (src0->backend != GGML_BACKEND_TYPE_GPU) { - ggml_cl_pool_free(d_X, x_size); - } - if (src1->backend != GGML_BACKEND_TYPE_GPU) { - ggml_cl_pool_free(d_Y, y_size); - } - if (dst->backend != GGML_BACKEND_TYPE_GPU) { - ggml_cl_pool_free(d_D, d_size); - } -} - -static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(fp16_support); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb10 = src1->nb[0]; - const int nb11 = src1->nb[1]; - const int nb12 = src1->nb[2]; - const int nb13 = src1->nb[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); - const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne); - GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne); - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata; - - size_t x_size; - size_t y_size; - size_t d_size; - cl_mem d_X; - if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT - d_X = (cl_mem) src0->extra; - } else { - d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); - } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size); - - bool src1_cont_rows = nb10 == sizeof(float); - bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); - - size_t x_offset = 0; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_TYPE_GPU) { - x_offset = (i03 * ne02 + i02) * x_ne; - } else { - // copy src0 to device - 
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); - } - - // FIXME: convert on device - - for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { - // convert src1 to fp16 - // TODO: use multiple threads - char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; - if (src1_cont_rows) { - if (src1_cont_cols) { - ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); - } - } - } - else { - for (int64_t i11 = 0; i11 < ne11; i11++) { - for (int64_t i10 = 0; i10 < ne10; i10++) { - // very slow due to no inlining - tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); - } - } - } - - // copy src1 to device - CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); - - CL_CHECK(clFinish(queue)); - - // compute - cl_event ev_sgemm; - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, x_offset, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, &ev_sgemm); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - - // copy dst to host, then convert to float - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - ggml_fp16_to_fp32_row(tmp, d, d_ne); - } else { - // FIXME: convert dst to fp32 on device - } - } - } - } - } - - if (src0->backend != GGML_BACKEND_TYPE_GPU) { - ggml_cl_pool_free(d_X, x_size); - } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); -} - -static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t ne03 = src0->ne[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; - const ggml_type type = src0->type; - const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0; - - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice - const size_t q_sz = ggml_type_size(type) * x_bps; - - size_t x_size; - size_t y_size; - size_t d_size; - size_t q_size; - cl_mem d_X; - if (!mul_mat_vec) { - d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); - } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); - cl_mem d_Q; - if (src0->backend == GGML_BACKEND_TYPE_CPU) { - d_Q = ggml_cl_pool_malloc(q_sz, &q_size); - } - - cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); - cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); - GGML_ASSERT(to_fp32_cl != nullptr); - - const size_t global_denom = ggml_cl_global_denom(type); - const size_t local = mul_mat_vec ? 
CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type); - - size_t ev_idx = 0; - std::vector events; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - // TODO: copy and dequantize src0 here when r3>1 - for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_TYPE_CPU) { - events.emplace_back(); - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { - d_Q = (cl_mem) src0->extra; - } else { - GGML_ASSERT(false); - } - - if (!mul_mat_vec) { - // convert src0 to fp32 on device - const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; - CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); - } - - int64_t i12 = i02 * r2; - int64_t e12 = i12 + r2; - events.reserve(e12 - i12); - for (; i12 < e12; i12++) { - if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel - // copy src1 to device - events.emplace_back(); - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); - - // compute - const size_t global = ne01 * local; - const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; - const cl_int ncols = ne00; - events.emplace_back(); - CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); - CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); - CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); - CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); - CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); - CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); - } else { // CLBlast matrix matrix multiplication - // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); - - // wait for conversion - CL_CHECK(clFinish(queue)); - - // compute - events.emplace_back(); - clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor, - clblast::Transpose::kYes, clblast::Transpose::kNo, - ne01, ne11, ne10, - alpha, - d_X, 0, ne00, - d_Y, 0, ne10, - beta, - d_D, 0, ne01, - &queue, events.data() + ev_idx++); - - if (status != clblast::StatusCode::kSuccess) { - GGML_ASSERT(false); - } - } - - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); - for (auto *event : events) { - clReleaseEvent(event); - } - - ev_idx = 0; - events.clear(); - } - } - } - } - - if (!mul_mat_vec) { - ggml_cl_pool_free(d_X, x_size); - } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_BACKEND_TYPE_CPU) { - ggml_cl_pool_free(d_Q, q_size); - } -} - - -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - if ((src0->type == GGML_TYPE_F32 || src0->type == 
GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) { - return true; - } - - return false; -} - -static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { - // If device doesn't support FP16 - if (!fp16_support) { - return false; - } - - size_t src0_sz = ggml_nbytes(src0); - size_t src1_sz = ggml_nbytes(src1); - - // mul_mat_q: src0 is converted to fp32 on device - size_t mul_mat_q_transfer = src0_sz + src1_sz; - - // mul_mat_f16: src1 is converted to fp16 on cpu - size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); - - // choose the smaller one to transfer to the device - // TODO: this is not always the best choice due to the overhead of converting to fp16 - return mul_mat_f16_transfer < mul_mat_q_transfer; -} - -void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); - - if (src0->type == GGML_TYPE_F32) { - ggml_cl_mul_mat_f32(src0, src1, dst); - } - else if (src0->type == GGML_TYPE_F16) { - if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); - } - else { - ggml_cl_mul_mat_q_f32(src0, src1, dst); - } - } - else if (ggml_is_quantized(src0->type)) { - ggml_cl_mul_mat_q_f32(src0, src1, dst); - } - else { - GGML_ASSERT(false); - } -} - -size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); - } - return 0; -} - -void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - const int64_t ne2 = tensor->ne[2]; - const int64_t ne3 = tensor->ne[3]; - - const ggml_type type = tensor->type; - const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type)); - const size_t q_sz = s_sz * (size_t) (ne2 * ne3); - - size_t q_size; - cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size); - - tensor->data = data; - // copy tensor to device - size_t offset = 0; - for (int64_t i3 = 0; i3 < ne3; i3++) { - for (int64_t i2 = 0; i2 < ne2; i2++) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); - offset += s_sz; - } - } - - CL_CHECK(clFinish(queue)); - - tensor->extra = dst; - GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); -} - -// ggml-backend - -// buffer - -struct ggml_backend_opencl_buffer_context { - ~ggml_backend_opencl_buffer_context() { - if (buffer) { - clReleaseMemObject(buffer); - } - for (auto * sub_buffer : sub_buffers) { - clReleaseMemObject(sub_buffer); - } - } - - cl_mem buffer; - std::vector sub_buffers; -}; - -static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; - -static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) { - return "OpenCL"; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - delete ctx; -} - -static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t 
buffer) { - return cl_ptr_base; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - if (tensor->view_src != NULL && tensor->view_offs == 0) { - tensor->extra = tensor->view_src->extra; - } else { - ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)}; - cl_int err; - cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); - CL_CHECK(err); - ctx->sub_buffers.push_back(sub_buffer); - tensor->extra = sub_buffer; - } - tensor->backend = GGML_BACKEND_TYPE_GPU; -} - -static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - cl_mem tensor_buffer = (cl_mem) tensor->extra; - CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); - CL_CHECK(clFinish(queue)); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - cl_mem tensor_buffer = (cl_mem) tensor->extra; - CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); - CL_CHECK(clFinish(queue)); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); - CL_CHECK(clFinish(queue)); -} - -static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { - ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; - for (auto * sub_buffer : ctx->sub_buffers) { - clReleaseMemObject(sub_buffer); - } - ctx->sub_buffers.clear(); -} - -static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { - /* .get_name = */ ggml_backend_opencl_buffer_get_name, - /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer, - /* .get_base = */ ggml_backend_opencl_buffer_get_base, - /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor, - /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_opencl_buffer_clear, - /* .reset = */ ggml_backend_opencl_buffer_reset, -}; - -// buffer type - -static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) { - return "OpenCL"; - - GGML_UNUSED(buffer_type); -} - -static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { - ggml_cl_init(); - - cl_int err; - cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); - if (err != CL_SUCCESS) { - fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); - return nullptr; - } - - ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}}; - - return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); -} - -static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { - // FIXME: not thread 
safe, device may not be initialized yet - static cl_uint alignment = -1; - if (alignment == (cl_uint)-1) { - ggml_cl_init(); - clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); - alignment /= 8; // bits to bytes - } - return alignment; - - GGML_UNUSED(buffer_type); -} - -static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { - static size_t max_size = -1; - if (max_size == (size_t)-1) { - ggml_cl_init(); - clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL); - } - return max_size; -} - -static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) { - //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend - return ggml_backend_is_cpu(backend); - - GGML_UNUSED(buffer_type); -} - -static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { - /* .get_name = */ ggml_backend_opencl_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_opencl_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL, - /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend, - /* .is_host = */ NULL, -}; - - -ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { - static ggml_backend_buffer_type buffer_type = { - /* .iface = */ ggml_backend_opencl_buffer_type_interface, - /* .context = */ nullptr, - }; - - return &buffer_type; -} - -#if 0 -// host buffer type - -static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return "CL_Host"; - - GGML_UNUSED(buft); -} - -static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) { - return "CL_Host"; - - GGML_UNUSED(buffer); -} - -static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_cl_host_free(buffer->context); -} - -static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr = ggml_cl_host_malloc(size); - - if (ptr == nullptr) { - // fallback to cpu buffer - return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - } - - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.get_name = ggml_backend_opencl_host_buffer_name; - buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer; - - return buffer; -} - -ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_opencl_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .context = */ nullptr, - }; - - return &ggml_backend_opencl_buffer_type_host; -} - -// backend - -static const char * ggml_backend_opencl_name(ggml_backend_t backend) { - return "OpenCL"; - - GGML_UNUSED(backend); 
-} - -static void ggml_backend_opencl_free(ggml_backend_t backend) { - GGML_UNUSED(backend); -} - -static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) { - return ggml_backend_opencl_buffer_type(); - - GGML_UNUSED(backend); -} - -static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { - for (int i = 0; i < graph->n_nodes; ++i) { - ggml_tensor * node = graph->nodes[i]; - - if (ggml_is_empty(node)) { - continue; - } - - switch (node->op) { - case GGML_OP_MUL_MAT: - ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); - break; - case GGML_OP_MUL: - ggml_cl_mul(node->src[0], node->src[1], node); - break; - default: - GGML_ASSERT(false); - } - } - - return GGML_STATUS_SUCCESS; - - GGML_UNUSED(backend); -} - -static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { - switch (op->op) { - case GGML_OP_MUL_MAT: - return ggml_cl_can_mul_mat(op->src[0], op->src[1], op); - case GGML_OP_MUL: - // return ggml_can_repeat_rows(op->src[1], op->src[0]); - return true; - default: - return false; - } - - GGML_UNUSED(backend); -} - -static ggml_backend_i opencl_backend_i = { - /* .get_name = */ ggml_backend_opencl_name, - /* .free = */ ggml_backend_opencl_free, - /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_opencl_graph_compute, - /* .supports_op = */ ggml_backend_opencl_supports_op, -}; - -ggml_backend_t ggml_backend_opencl_init() { - ggml_backend_t backend = new ggml_backend { - /* .interface = */ opencl_backend_i, - /* .context = */ nullptr - }; - - return backend; -} - -bool ggml_backend_is_opencl(ggml_backend_t backend) { - return backend && backend->iface.get_name == ggml_backend_opencl_name; -} -#endif diff --git a/ggml-opencl.h b/ggml-opencl.h deleted file mode 100644 index 257a6be6a..000000000 --- a/ggml-opencl.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#ifdef __cplusplus -extern "C" { -#endif - -GGML_API void ggml_cl_init(void); - -GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); -GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); - -// GGML_API void * ggml_cl_host_malloc(size_t size); -// GGML_API void ggml_cl_host_free(void * ptr); - -GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); - -GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); - -// backend API - -// GGML_API ggml_backend_t ggml_backend_opencl_init(void); - -// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend); - -GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); -// 
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml.c b/ggml.c index 8869e146a..11e5c34ab 100644 --- a/ggml.c +++ b/ggml.c @@ -297,17 +297,12 @@ inline static void * ggml_calloc(size_t num, size_t size) { #if defined(GGML_USE_ACCELERATE) #include -#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions -#include "ggml-opencl.h" -#endif #elif defined(GGML_USE_OPENBLAS) #if defined(GGML_BLAS_USE_MKL) #include #else #include #endif -#elif defined(GGML_USE_CLBLAST) -#include "ggml-opencl.h" #endif // floating point type used to accumulate sums @@ -3380,10 +3375,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } -#if defined(GGML_USE_CLBLAST) - ggml_cl_init(); -#endif - ggml_setup_op_has_task_pass(); is_first_call = false; @@ -9053,17 +9044,6 @@ static void ggml_compute_forward_add_f32( const int ith = params->ith; const int nth = params->nth; -#ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_TYPE_GPU) { - // TODO: OpenCL kernel support full broadcast - GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); - if (ith == 0) { - ggml_cl_add(src0, src1, dst); - } - return; - } -#endif - const int nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS @@ -10171,17 +10151,6 @@ static void ggml_compute_forward_mul_f32( const int ith = params->ith; const int nth = params->nth; -#if defined(GGML_USE_CLBLAST) - if (src1->backend == GGML_BACKEND_TYPE_GPU) { - // TODO: OpenCL kernel support full broadcast - GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); - if (ith == 0) { - ggml_cl_mul(src0, src1, dst); - } - return; - } -#endif - const int64_t nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS @@ -12417,15 +12386,6 @@ static void ggml_compute_forward_mul_mat( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) { - ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); - } - return; - } -#endif - #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(dst)) { const int64_t ne_plane = ne01*ne00; @@ -12873,8 +12833,6 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CLBLAST) - #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) bool use_blas = ggml_is_matrix(src0) && ggml_is_matrix(src1) && @@ -13072,7 +13030,7 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) + // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { @@ -19546,11 +19504,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa { const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type; -#if defined(GGML_USE_CLBLAST) - if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) { - cur = ggml_cl_mul_mat_get_wsize(node->src[0], node->src[1], node); - } else -#endif #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) if (ggml_compute_forward_mul_mat_use_blas(node)) { if (node->src[0]->type != 
GGML_TYPE_F32) { @@ -22859,7 +22812,7 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL) return 1; #else return 0; @@ -22874,14 +22827,6 @@ int ggml_cpu_has_cuda(void) { #endif } -int ggml_cpu_has_clblast(void) { -#if defined(GGML_USE_CLBLAST) - return 1; -#else - return 0; -#endif -} - int ggml_cpu_has_vulkan(void) { #if defined(GGML_USE_VULKAN) return 1; @@ -22915,8 +22860,7 @@ int ggml_cpu_has_rpc(void) { } int ggml_cpu_has_gpublas(void) { - return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || - ggml_cpu_has_sycl(); + return ggml_cpu_has_cuda() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); } int ggml_cpu_has_sse3(void) { diff --git a/ggml.h b/ggml.h index f38699698..addcf1bfe 100644 --- a/ggml.h +++ b/ggml.h @@ -2425,7 +2425,6 @@ extern "C" { GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_blas (void); GGML_API int ggml_cpu_has_cuda (void); - GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_kompute (void); GGML_API int ggml_cpu_has_gpublas (void); diff --git a/llama.cpp b/llama.cpp index 92c33f53e..c05e2bdb7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13,8 +13,6 @@ #ifdef GGML_USE_CUDA # include "ggml-cuda.h" -#elif defined(GGML_USE_CLBLAST) -# include "ggml-opencl.h" #elif defined(GGML_USE_VULKAN) # include "ggml-vulkan.h" #elif defined(GGML_USE_SYCL) @@ -2406,8 +2404,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_ buft = ggml_backend_vk_buffer_type(gpu); #elif defined(GGML_USE_SYCL) buft = ggml_backend_sycl_buffer_type(gpu); -#elif defined(GGML_USE_CLBLAST) - buft = ggml_backend_opencl_buffer_type(); #elif defined(GGML_USE_KOMPUTE) buft = ggml_backend_kompute_buffer_type(gpu); if (buft == nullptr) { @@ -2530,10 +2526,6 @@ static bool llama_kv_cache_init( } } -#ifdef GGML_USE_CLBLAST - offload = false; -#endif - // count used buffer types std::map buft_layer_count; if (offload) { @@ -15921,7 +15913,7 @@ bool llama_supports_mlock(void) { } bool llama_supports_gpu_offload(void) { -#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ +#if defined(GGML_USE_CUDA) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. 
return true; diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in index 92e39708b..9311055d9 100644 --- a/scripts/LlamaConfig.cmake.in +++ b/scripts/LlamaConfig.cmake.in @@ -5,7 +5,6 @@ set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_BLAS @LLAMA_BLAS@) set(LLAMA_CUDA @LLAMA_CUDA@) set(LLAMA_METAL @LLAMA_METAL@) -set(LLAMA_CLBLAST @LLAMA_CLBLAST@) set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@) set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@) @@ -36,10 +35,6 @@ if (LLAMA_METAL) find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) endif() -if (LLAMA_CLBLAST) - find_package(CLBlast REQUIRED) -endif() - if (LLAMA_HIPBLAS) find_package(hip REQUIRED) find_package(hipblas REQUIRED) diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 6016eb2c0..513dde5e1 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -19,17 +19,17 @@ logger = logging.getLogger("compare-llama-bench") # Properties by which to differentiate results per commit: KEY_PROPERTIES = [ - "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", + "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads", "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen" ] # Properties that are boolean and are converted to Yes/No for the table: -BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"] +BOOL_PROPERTIES = ["cuda", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"] # Header names for the table: PRETTY_NAMES = { - "cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC", + "cuda": "CUDA", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC", "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model", "model_size": "Model Size [GiB]", "model_n_params": "Num. 
of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode", diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index eb6ce458e..b3715e204 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -3,7 +3,7 @@ # Helper script for deploying llama.cpp server with a single Bash command # # - Works on Linux and macOS -# - Supports: CPU, CUDA, Metal, OpenCL +# - Supports: CPU, CUDA, Metal # - Can run all GGUF models from HuggingFace # - Can serve requests in parallel # - Always builds latest llama.cpp from GitHub @@ -19,7 +19,7 @@ # --port: port number, default is 8888 # --repo: path to a repo containing GGUF model files # --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input -# --backend: cpu, cuda, metal, opencl, depends on the OS +# --backend: cpu, cuda, metal, depends on the OS # --gpu-id: gpu id, default is 0 # --n-parallel: number of parallel requests, default is 8 # --n-kv: KV cache size, default is 4096 @@ -72,7 +72,7 @@ function print_usage { printf " --port: port number, default is 8888\n" printf " --repo: path to a repo containing GGUF model files\n" printf " --wtype: weights type (f16, q8_0, q4_0, q4_1), default is user-input\n" - printf " --backend: cpu, cuda, metal, opencl, depends on the OS\n" + printf " --backend: cpu, cuda, metal, depends on the OS\n" printf " --gpu-id: gpu id, default is 0\n" printf " --n-parallel: number of parallel requests, default is 8\n" printf " --n-kv: KV cache size, default is 4096\n" @@ -387,9 +387,6 @@ elif [[ "$backend" == "cpu" ]]; then elif [[ "$backend" == "metal" ]]; then printf "[+] Building with Metal backend\n" make -j server $log -elif [[ "$backend" == "opencl" ]]; then - printf "[+] Building with OpenCL backend\n" - LLAMA_CLBLAST=1 make -j server $log else printf "[-] Unknown backend: %s\n" "$backend" exit 1 @@ -407,8 +404,6 @@ elif [[ "$backend" == "cpu" ]]; then args="-ngl 0" elif [[ "$backend" == "metal" ]]; then args="-ngl 999" -elif [[ "$backend" == "opencl" ]]; then - args="-ngl 999" else printf "[-] Unknown backend: %s\n" "$backend" exit 1 diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 3f8ddf37b..9e34dc8b9 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -106,8 +106,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-kompute.h -> ggml-kompute.h # src/ggml-metal.h -> ggml-metal.h # src/ggml-metal.m -> ggml-metal.m - # src/ggml-opencl.cpp -> ggml-opencl.cpp - # src/ggml-opencl.h -> ggml-opencl.h # src/ggml-quants.c -> ggml-quants.c # src/ggml-quants.h -> ggml-quants.h # src/ggml-rpc.cpp -> ggml-rpc.cpp @@ -143,8 +141,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \ -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \ -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \ - -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \ - -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \ -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \ -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \ -e 's/src\/ggml-rpc\.cpp/ggml-rpc.cpp/g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index fbae6b7f8..4843f8a4a 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -14,8 +14,6 @@ cp -rpv ../ggml/src/ggml-kompute.h ./ggml-kompute.h cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal -cp -rpv ../ggml/src/ggml-opencl.cpp 
./ggml-opencl.cpp -cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h cp -rpv ../ggml/src/ggml-quants.c ./ggml-quants.c cp -rpv ../ggml/src/ggml-quants.h ./ggml-quants.h cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml-rpc.cpp From 1442677f92e45a475be7b4d056e3633d1d6f813b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 4 Jun 2024 21:23:39 +0300 Subject: [PATCH 07/39] common : refactor cli arg parsing (#7675) * common : gpt_params_parse do not print usage * common : rework usage print (wip) * common : valign * common : rework print_usage * infill : remove cfg support * common : reorder args * server : deduplicate parameters ggml-ci * common : add missing header ggml-ci * common : remote --random-prompt usages ggml-ci * examples : migrate to gpt_params ggml-ci * batched-bench : migrate to gpt_params * retrieval : migrate to gpt_params * common : change defaults for escape and n_ctx * common : remove chatml and instruct params ggml-ci * common : passkey use gpt_params --- common/common.cpp | 819 +++++++++++++------ common/common.h | 105 ++- examples/batched-bench/README.md | 8 +- examples/batched-bench/batched-bench.cpp | 92 +-- examples/batched/README.md | 2 +- examples/batched/batched.cpp | 73 +- examples/embedding/embedding.cpp | 4 +- examples/eval-callback/eval-callback.cpp | 6 +- examples/gguf-split/tests.sh | 10 +- examples/gritlm/gritlm.cpp | 2 + examples/imatrix/imatrix.cpp | 8 +- examples/infill/infill.cpp | 134 +-- examples/llama-bench/llama-bench.cpp | 48 +- examples/llava/llava-cli.cpp | 14 +- examples/lookahead/lookahead.cpp | 3 +- examples/lookup/lookup-create.cpp | 2 + examples/lookup/lookup-stats.cpp | 1 + examples/lookup/lookup.cpp | 1 + examples/main/README.md | 5 +- examples/main/main.cpp | 69 +- examples/parallel/parallel.cpp | 3 +- examples/passkey/README.md | 2 +- examples/passkey/passkey.cpp | 68 +- examples/perplexity/perplexity.cpp | 12 +- examples/quantize/tests.sh | 4 +- examples/retrieval/retrieval.cpp | 90 +- examples/save-load-state/save-load-state.cpp | 1 + examples/server/server.cpp | 700 ++-------------- examples/server/utils.hpp | 7 - examples/simple/README.md | 2 +- examples/simple/simple.cpp | 50 +- examples/speculative/speculative.cpp | 3 +- llama.cpp | 2 +- scripts/run-with-preset.py | 4 +- 34 files changed, 899 insertions(+), 1455 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index df583db83..c8df9a4ce 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,20 +7,21 @@ #include #include +#include #include +#include +#include #include #include #include -#include #include +#include #include #include #include #include #include #include -#include -#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -237,10 +238,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } } - if (params.prompt_cache_all && - (params.interactive || params.interactive_first || - params.instruct)) { - + if (params.prompt_cache_all && (params.interactive || params.interactive_first)) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } @@ -265,22 +263,25 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { - bool result = true; + const auto params_org = params; // the example can modify the default params + try { - if (!gpt_params_parse_ex(argc, argv, params)) { - gpt_params_print_usage(argc, argv, gpt_params()); - exit(0); + if (!gpt_params_parse_ex(argc, argv, params) || params.usage) 
{ + params = params_org; + params.usage = true; + return false; } - } - catch (const std::invalid_argument & ex) { + } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); - gpt_params_print_usage(argc, argv, gpt_params()); - exit(1); + return false; } - return result; + + return true; } bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { + const char split_delim = ','; + llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { @@ -288,7 +289,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa invalid_param = true; return true; } - // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context. + // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); sparams.seed = std::stoul(argv[i]); return true; @@ -349,6 +350,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.escape = true; return true; } + if (arg == "--no-escape") { + params.escape = false; + return true; + } if (arg == "--prompt-cache") { if (++i >= argc) { invalid_param = true; @@ -403,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "-n" || arg == "--n-predict") { + if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; return true; @@ -900,34 +905,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.interactive = true; return true; } - if (arg == "--interactive-specials") { - params.interactive_specials = true; - return true; - } - if (arg == "--special") { + if (arg == "-sp" || arg == "--special") { params.special = true; return true; } - if (arg == "--embedding") { + if (arg == "--embedding" || arg == "--embeddings") { params.embedding = true; return true; } - if (arg == "--interactive-first") { + if (arg == "-if" || arg == "--interactive-first") { params.interactive_first = true; return true; } - if (arg == "-ins" || arg == "--instruct") { - params.instruct = true; - return true; - } if (arg == "-cnv" || arg == "--conversation") { params.conversation = true; return true; } - if (arg == "-cml" || arg == "--chatml") { - params.chatml = true; - return true; - } if (arg == "--infill") { params.infill = true; return true; @@ -964,7 +957,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.flash_attn = true; return true; } - if (arg == "--color") { + if (arg == "-co" || arg == "--color") { params.use_color = true; return true; } @@ -972,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.use_mlock = true; return true; } - if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; return true; } params.n_gpu_layers = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } return true; } - if (arg == 
"--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") { if (++i >= argc) { invalid_param = true; return true; } params.n_gpu_layers_draft = std::stoi(argv[i]); if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); } return true; @@ -1087,6 +1080,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa else { invalid_param = true; } return true; } + if (arg == "-v" || arg == "--verbose") { + params.verbose = true; + return true; + } if (arg == "--verbose-prompt") { params.verbose_prompt = true; return true; @@ -1151,6 +1148,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.ppl_stride = std::stoi(argv[i]); return true; } + if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_output_type = std::stoi(argv[i]); + return true; + } if (arg == "-ptc" || arg == "--print-token-count") { if (++i >= argc) { invalid_param = true; @@ -1163,14 +1168,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.check_tensors = true; return true; } - if (arg == "--ppl-output-type") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.ppl_output_type = std::stoi(argv[i]); - return true; - } if (arg == "--hellaswag") { params.hellaswag = true; return true; @@ -1242,19 +1239,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } - if (arg == "-h" || arg == "--help") { - gpt_params_print_usage(argc, argv, gpt_params()); - exit(0); + if (arg == "-h" || arg == "--help" || arg == "--usage" ) { + params.usage = true; + return true; } if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } - if (arg == "--random-prompt") { - params.random_prompt = true; - return true; - } if (arg == "--in-prefix-bos") { params.input_prefix_bos = true; return true; @@ -1321,6 +1314,229 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hostname = argv[i]; + return true; + } + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.port = std::stoi(argv[i]); + return true; + } + if (arg == "--path") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.public_path = argv[i]; + return true; + } + if (arg == "--api-key") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.api_keys.push_back(argv[i]); + return true; + } + if (arg == "--api-key-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream key_file(argv[i]); + if (!key_file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string key; + while (std::getline(key_file, key)) { + if (!key.empty()) { + params.api_keys.push_back(key); + } + } + key_file.close(); + return true; + } + if (arg 
== "--ssl-key-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ssl_file_key = argv[i]; + return true; + } + if (arg == "--ssl-cert-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ssl_file_cert = argv[i]; + return true; + } + if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.timeout_read = std::stoi(argv[i]); + params.timeout_write = std::stoi(argv[i]); + return true; + } + if (arg == "-spf" || arg == "--system-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::string system_prompt; + std::copy( + std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(system_prompt) + ); + params.system_prompt = system_prompt; + return true; + } + if (arg == "--log-format") { + if (++i >= argc) { + invalid_param = true; + return true; + } + if (std::strcmp(argv[i], "json") == 0) { + params.log_json = true; + } else if (std::strcmp(argv[i], "text") == 0) { + params.log_json = false; + } else { + invalid_param = true; + return true; + } + return true; + } + if (arg == "--no-slots") { + params.endpoint_slots = false; + return true; + } + if (arg == "--metrics") { + params.endpoint_metrics = true; + return true; + } + if (arg == "--slot-save-path") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.slot_save_path = argv[i]; + // if doesn't end with DIRECTORY_SEPARATOR, add it + if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { + params.slot_save_path += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "--chat-template") { + if (++i >= argc) { + invalid_param = true; + return true; + } + if (!llama_chat_verify_template(argv[i])) { + fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); + fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); + invalid_param = true; + return true; + } + params.chat_template = argv[i]; + return true; + } + if (arg == "-pps") { + params.is_pp_shared = true; + return true; + } + if (arg == "-npp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split(argv[i], split_delim); + params.n_pp.insert(params.n_pp.end(), p.begin(), p.end()); + return true; + } + if (arg == "-ntg") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split(argv[i], split_delim); + params.n_tg.insert(params.n_tg.end(), p.begin(), p.end()); + return true; + } + if (arg == "-npl") { + if (++i >= argc) { + invalid_param = true; + return true; + } + auto p = string_split(argv[i], split_delim); + params.n_pl.insert(params.n_pl.end(), p.begin(), p.end()); + return true; + } + if (arg == "--context-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.context_files.push_back(argv[i]); + return true; + } + if (arg == "--chunk-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.chunk_size = std::stoi(argv[i]); + return true; + } + if (arg == "--chunk-separator") { + if (++i >= argc) { + invalid_param = 
true; + return true; + } + params.chunk_separator = argv[i]; + return true; + } + if (arg == "--junk") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_junk = std::stoi(argv[i]); + return true; + } + if (arg == "--pos") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.i_pos = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1348,6 +1564,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return false; } +#ifdef __GNUC__ +#ifdef __MINGW32__ +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__))) +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) +#endif +#else +#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) +#endif + void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { const llama_sampling_params & sparams = params.sparams; @@ -1359,198 +1585,290 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param } sampler_type_names.pop_back(); - printf("\n"); - printf("usage: %s [options]\n", argv[0]); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); - printf(" -i, --interactive run in interactive mode\n"); - printf(" --special special tokens output enabled\n"); - printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); - printf(" --interactive-first run in interactive mode and wait for input right away\n"); - printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); - printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); - printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - printf(" -r PROMPT, --reverse-prompt PROMPT\n"); - printf(" halt generation at PROMPT, return control in interactive mode\n"); - printf(" (can be specified more than once for multiple prompts).\n"); - printf(" --color colorise output to distinguish prompt and user input from generations\n"); - printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); - printf(" -tb N, --threads-batch N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" -td N, --threads-draft N"); - printf(" number of threads to use during generation (default: same as --threads)\n"); - printf(" -tbd N, --threads-batch-draft N\n"); - printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n"); - printf(" -p PROMPT, --prompt PROMPT\n"); - printf(" prompt to start generation with (default: empty)\n"); - printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - printf(" not supported with --interactive or other interactive options\n"); - printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update 
it.\n"); - printf(" --random-prompt start with a randomized prompt.\n"); - printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n"); - printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - printf(" -f FNAME, --file FNAME\n"); - printf(" prompt file to start generation.\n"); - printf(" -bf FNAME, --binary-file FNAME\n"); - printf(" binary file containing multiple choice tasks.\n"); - printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); - printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); - printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); - printf(" -ub N, --ubatch-size N\n"); - printf(" physical maximum batch size (default: %d)\n", params.n_ubatch); - printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n"); - printf(" (default: %s)\n", sampler_type_names.c_str()); - printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str()); - printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); - printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); - printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p); - printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); - printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq); - printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range); - printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent); - printf(" --mirostat N use Mirostat sampling.\n"); - printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); - printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta); - printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau); - printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); - printf(" modifies the likelihood of token appearing in the completion,\n"); - printf(" i.e. 
`--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); - printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); - printf(" --grammar-file FNAME file to read grammar from\n"); - printf(" -j SCHEMA, --json-schema SCHEMA\n"); - printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); - printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); - printf(" --cfg-negative-prompt PROMPT\n"); - printf(" negative prompt to use for guidance. (default: empty)\n"); - printf(" --cfg-negative-prompt-file FNAME\n"); - printf(" negative prompt file to use for guidance. (default: empty)\n"); - printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n"); - printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n"); - printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); - printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); - printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); - printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); - printf(" --pooling {none,mean,cls}\n"); - printf(" pooling type for embeddings, use model default if unspecified\n"); - printf(" -dt N, --defrag-thold N\n"); - printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); - printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - printf(" --penalize-nl penalize newline tokens\n"); - printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); - printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n"); - printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); - printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); - printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); - printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks); - printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n"); - printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks); - printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n"); - printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft); 
- printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); - printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel); - printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); - printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); - printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled"); - printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); - printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n"); + struct option_info { + LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5) + option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) { + va_list args_list; + va_start(args_list, desc); + char buffer[1024]; + vsnprintf(buffer, sizeof(buffer), desc, args_list); + va_end(args_list); + this->desc = buffer; + } + + option_info(const std::string & grp) : grp(grp) {} + + std::string tags; + std::string args; + std::string desc; + std::string grp; + }; + + std::vector options; + + // TODO: filter by tags + + options.push_back({ "general" }); + options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); + options.push_back({ "*", " --version", "show version and build info" }); + options.push_back({ "*", "-v, --verbose", "print verbose information" }); + options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); + options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); + options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? 
"true" : "false" }); + options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed }); + options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads }); + options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" }); + options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" }); + options.push_back({ "speculative", "-tbd, --threads-batch-draft N", + "number of threads to use during batch and prompt processing (default: same as --threads-draft)" }); + options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft }); + options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split }); + options.push_back({ "*", "-lcs, --lookup-cache-static FNAME", + "path to static lookup cache to use for lookup decoding (not updated by generation)" }); + options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME", + "path to dynamic lookup cache to use for lookup decoding (updated by generation)" }); + + options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx }); + options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict }); + options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch }); + options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch }); + options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep }); + options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks }); + options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); + options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); + options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); + options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false" }); + options.push_back({ "*", " --no-escape", "do not process escape sequences" }); + options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print }); + options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" }); + options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n" + "not supported with --interactive or other interactive options" }); + options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" }); + options.push_back({ "main", "-r, --reverse-prompt PROMPT", + "halt generation at PROMPT, return control in interactive mode\n" + "can be specified more than once for multiple prompts" }); + options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" }); + options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" }); + options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" }); + options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" }); + options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" }); + options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" }); + options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" }); + options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" }); + + options.push_back({ "sampling" }); + options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n" + "(default: %s)", sampler_type_names.c_str() }); + options.push_back({ "*", " --sampling-seq SEQUENCE", + "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() }); + options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" }); + options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? 
"true" : "false" }); + options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp }); + options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k }); + options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p }); + options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p }); + options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z }); + options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p }); + options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n }); + options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat }); + options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present }); + options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq }); + options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range }); + options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent }); + options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n" + "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n" + "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat }); + options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta }); + options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau }); + options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n" + "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n" + "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" }); + options.push_back({ "main", " --cfg-negative-prompt PROMPT", + "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() }); + options.push_back({ "main", " --cfg-negative-prompt-file FNAME", + "negative prompt file to use for guidance" }); + options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale }); + + options.push_back({ "grammar" }); + options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() }); + options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" }); + options.push_back({ "*", "-j, --json-schema SCHEMA", + "JSON schema to constrain generations (https://json-schema.org/), e.g. 
`{}` for any JSON object\n" + "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" }); + + options.push_back({ "embedding" }); + options.push_back({ "embedding", " --pooling {none,mean,cls}", + "pooling type for embeddings, use model default if unspecified" }); + + options.push_back({ "context hacking" }); + options.push_back({ "*", " --rope-scaling {none,linear,yarn}", + "RoPE frequency scaling method, defaults to linear unless specified by the model" }); + options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" }); + options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" }); + options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" }); + options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx }); + options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor }); + options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor }); + options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow }); + options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast }); + options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n }); + options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w }); + options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" }); + options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" }); + options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() }); + options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() }); + + options.push_back({ "perplexity" }); + options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? 
"true" : "false" }); + options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks }); + options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks }); + options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" }); + options.push_back({ "perplexity", " --multiple-choice-tasks N", + "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks }); + options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" }); + options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride }); + options.push_back({ "perplexity", " --ppl-output-type {0,1}", + "output type for perplexity calculation (default: %d)", params.ppl_output_type }); + + options.push_back({ "parallel" }); + options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold }); + options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel }); + options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences }); + options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" }); + + options.push_back({ "multi-modality" }); + options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" }); + options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching" }); + + options.push_back({ "backend" }); + options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" }); if (llama_supports_mlock()) { - printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); + options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" }); } if (llama_supports_mmap()) { - printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" }); } - printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); - printf(" - distribute: spread execution evenly over all nodes\n"); - printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); - printf(" - numactl: use the CPU map provided by numactl\n"); - printf(" if run without this previously, it is recommended to drop the system page cache before using this\n"); - printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n"); + options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n" + " - distribute: spread execution evenly over all nodes\n" + " - isolate: only spawn threads on CPUs on the node that execution started on\n" + " - numactl: use the CPU map provided by numactl\n" + "if run without this previously, it is recommended to drop the system page cache before using this\n" + "see https://github.com/ggerganov/llama.cpp/issues/1437" }); + if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -ngld N, --n-gpu-layers-draft N\n"); - printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT, --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); + options.push_back({ "*", "-ngl, --gpu-layers N", + "number of layers to store in VRAM" }); + options.push_back({ "*", "-ngld, --gpu-layers-draft N", + "number of layers to store in VRAM for the draft model" }); + options.push_back({ "*", "-sm, --split-mode SPLIT_MODE", + "how to split the model across multiple GPUs, one of:\n" + " - none: use one GPU only\n" + " - layer (default): split layers and KV across GPUs\n" + " - row: split rows across GPUs" }); + options.push_back({ "*", "-ts, --tensor-split SPLIT", + "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" }); + options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n" + "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu }); } - printf(" --rpc SERVERS comma separated list of RPC servers\n"); - printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? 
"true" : "false"); - printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false"); - printf(" -gan N, --grp-attn-n N\n"); - printf(" group-attention factor (default: %d)\n", params.grp_attn_n); - printf(" -gaw N, --grp-attn-w N\n"); - printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w); - printf(" -dkvc, --dump-kv-cache\n"); - printf(" verbose print of the KV cache\n"); - printf(" -nkvo, --no-kv-offload\n"); - printf(" disable KV offload\n"); - printf(" -ctk TYPE, --cache-type-k TYPE\n"); - printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str()); - printf(" -ctv TYPE, --cache-type-v TYPE\n"); - printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str()); - printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - printf(" --control-vector FNAME\n"); - printf(" add a control vector\n"); - printf(" --control-vector-scaled FNAME S\n"); - printf(" add a control vector with user defined scaling S\n"); - printf(" --control-vector-layer-range START END\n"); - printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); - printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding (default: unused)\n"); - printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: unused)\n"); - printf(" -hfr REPO, --hf-repo REPO\n"); - printf(" Hugging Face model repository (default: unused)\n"); - printf(" -hff FILE, --hf-file FILE\n"); - printf(" Hugging Face model file (default: unused)\n"); - printf(" -ld LOGDIR, --logdir LOGDIR\n"); - printf(" path under which to save YAML logs (no logging if unset)\n"); - printf(" -lcs FNAME, --lookup-cache-static FNAME\n"); - printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n"); - printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n"); - printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -ptc N, --print-token-count N\n"); - printf(" print token count every N tokens (default: %d)\n", params.n_print); - printf(" --check-tensors check model tensor data for invalid values\n"); - printf("\n"); + + options.push_back({ "model" }); + options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" }); + options.push_back({ "*", " --override-kv KEY=TYPE:VALUE", + "advanced option to override model metadata by key. may be specified multiple times.\n" + "types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false" }); + options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" }); + options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" }); + options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" }); + options.push_back({ "*", " --control-vector FNAME", "add a control vector" }); + options.push_back({ "*", " --control-vector-scaled FNAME SCALE", + "add a control vector with user defined scaling SCALE" }); + options.push_back({ "*", " --control-vector-layer-range START END", + "layer range to apply the control vector(s) to, start and end inclusive" }); + options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n" + "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH }); + options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" }); + options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" }); + options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" }); + options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" }); + + options.push_back({ "retrieval" }); + options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" }); + options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size }); + options.push_back({ "retrieval", " --chunk-separator STRING", + "separator between chunks (default: '%s')", params.chunk_separator.c_str() }); + + options.push_back({ "passkey" }); + options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); + options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + + options.push_back({ "bench" }); + options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); + options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); + options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" }); + options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" }); + + options.push_back({ "server" }); + options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() }); + options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port }); + options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() }); + options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? 
"enabled" : "disabled" }); + options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" }); + options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" }); + options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); + options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); + options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); + options.push_back({ "server", " --system-prompt-file FNAME", + "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); + options.push_back({ "server", " --log-format {text,json}", + "log output format: json or text (default: json)" }); + options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" }); + options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" }); + options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" }); + options.push_back({ "server", " --chat-template JINJA_TEMPLATE", + "set custom jinja chat template (default: template taken from model's metadata)\n" + "only commonly used templates are accepted:\n" + "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + #ifndef LOG_DISABLE_LOGS - log_print_usage(); + options.push_back({ "logging" }); + options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" }); + options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" }); + options.push_back({ "logging", " --log-test", "Run simple logging test" }); + options.push_back({ "logging", " --log-disable", "Disable trace logs" }); + options.push_back({ "logging", " --log-enable", "Enable trace logs" }); + options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" }); + options.push_back({ "logging", " --log-new", "Create a separate new log file on start. " + "Each log file will have unique name: \"..log\"" }); + options.push_back({ "logging", " --log-append", "Don't truncate the old log file." }); #endif // LOG_DISABLE_LOGS + + printf("usage: %s [options]\n", argv[0]); + + for (const auto & o : options) { + if (!o.grp.empty()) { + printf("\n%s:\n\n", o.grp.c_str()); + continue; + } + printf(" %-32s", o.args.c_str()); + if (o.args.length() > 30) { + printf("\n%34s", ""); + } + + const auto desc = o.desc; + size_t start = 0; + size_t end = desc.find('\n'); + while (end != std::string::npos) { + printf("%s\n%34s", desc.substr(start, end - start).c_str(), ""); + start = end + 1; + end = desc.find('\n', start); + } + + printf("%s\n", desc.substr(start).c_str()); + } + printf("\n"); } std::string gpt_params_get_system_info(const gpt_params & params) { @@ -1610,24 +1928,6 @@ std::string string_get_sortable_timestamp() { return std::string(timestamp_no_ns) + "." 
+ std::string(timestamp_ns); } -std::string string_random_prompt(std::mt19937 & rng) { - const int r = rng() % 10; - switch (r) { - case 0: return "So"; - case 1: return "Once upon a time"; - case 2: return "When"; - case 3: return "The"; - case 4: return "After"; - case 5: return "If"; - case 6: return "import"; - case 7: return "He"; - case 8: return "She"; - case 9: return "They"; - } - - GGML_UNREACHABLE(); -} - void string_process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -2503,6 +2803,12 @@ bool llama_should_add_bos_token(const llama_model * model) { return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM); } +bool llama_chat_verify_template(const std::string & tmpl) { + llama_chat_message chat[] = {{"user", "test"}}; + int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); + return res >= 0; +} + // // KV cache utils // @@ -2902,9 +3208,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str()); fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false"); yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str()); - fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); - fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); @@ -2954,7 +3258,6 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens); - fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? 
"true" : "false"); fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat); fprintf(stream, "reverse_prompt:\n"); diff --git a/common/common.h b/common/common.h index 264504830..e0a08a61b 100644 --- a/common/common.h +++ b/common/common.h @@ -60,7 +60,7 @@ struct gpt_params { int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 512; // context size + int32_t n_ctx = 0; // context size int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt @@ -99,23 +99,23 @@ struct gpt_params { // // sampling parameters struct llama_sampling_params sparams; - std::string model = ""; // model path - std::string model_draft = ""; // draft model for speculative decoding + std::string model = ""; // model path + std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias - std::string model_url = ""; // model url to download - std::string hf_repo = ""; // HF repo - std::string hf_file = ""; // HF file + std::string model_url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with - std::vector antiprompt; // string upon seeing which more user input is prompted - std::string logdir = ""; // directory in which to save YAML log files + std::string prompt_file = ""; // store the external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with + std::string logdir = ""; // directory in which to save YAML log files std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding - std::string logits_file = ""; // file for saving *all* logits + std::string logits_file = ""; // file for saving *all* logits + std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; // TODO: avoid tuple, use struct @@ -127,8 +127,8 @@ struct gpt_params { int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector - int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. - int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. 
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line // (which is more convenient to use for plotting) // bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt @@ -142,19 +142,17 @@ struct gpt_params { bool kl_divergence = false; // compute KL divergence - bool random_prompt = false; // do not randomize prompt if none provided + bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs - bool interactive = false; // interactive mode - bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode bool special = false; // enable special token output + bool interactive = false; // interactive mode + bool interactive_first = false; // wait for user input immediately bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) - bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it bool embedding = false; // get only sentence embedding - bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\" - bool interactive_first = false; // wait for user input immediately + bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\" bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles bool cont_batching = true; // insert new sequences for decoding on-the-fly @@ -162,10 +160,10 @@ struct gpt_params { bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens - bool instruct = false; // instruction mode (used for Alpaca models) bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory + bool verbose = false; bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode @@ -180,6 +178,47 @@ struct gpt_params { // multimodal models (see examples/llava) std::string mmproj = ""; // path to multimodal projector std::vector image; // path to image file(s) + + // server params + int32_t port = 8080; + int32_t timeout_read = 600; + int32_t timeout_write = timeout_read; + int32_t n_threads_http = -1; + + std::string hostname = "127.0.0.1"; + std::string public_path = ""; + std::string chat_template = ""; + std::string system_prompt = ""; + + std::vector api_keys; + + std::string ssl_file_key = ""; + std::string ssl_file_cert = ""; + + bool endpoint_slots = true; + bool endpoint_metrics = false; + + bool log_json = false; + + std::string slot_save_path; + + // batched-bench params + bool is_pp_shared = false; + + std::vector n_pp; + std::vector n_tg; + std::vector n_pl; + + // retrieval params + std::vector context_files; // context files to embed + + int32_t chunk_size = 64; // chunk size for context embedding + + std::string chunk_separator = "\n"; // chunk separator for context embedding + + // passkey params + int32_t n_junk = 250; // number of times to repeat the junk text + int32_t i_pos = -1; // position of the passkey in the junk text }; void 
gpt_params_handle_model_default(gpt_params & params); @@ -199,7 +238,20 @@ std::vector string_split(std::string input, char separator); std::string string_strip(const std::string & str); std::string string_get_sortable_timestamp(); -std::string string_random_prompt(std::mt19937 & rng); + +template +static std::vector string_split(const std::string & str, char delim) { + std::vector values; + std::istringstream str_stream(str); + std::string token; + while (std::getline(str_stream, token, delim)) { + T value; + std::istringstream token_stream(token); + token_stream >> value; + values.push_back(value); + } + return values; +} bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); @@ -282,6 +334,13 @@ std::string llama_detokenize_bpe( // defaults to true when model type is SPM, otherwise false. bool llama_should_add_bos_token(const llama_model * model); +// +// Chat template utils +// + +// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid +bool llama_chat_verify_template(const std::string & tmpl); + // // KV cache utils // diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index bf951baf7..fa4baf640 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] +./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99 +./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99 +./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps # custom set of batches -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32 +./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 2924d8116..718f0a61a 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -28,67 +28,27 @@ static std::vector parse_list(char * p) { return ret; } +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]); + LOG_TEE("\n"); +} + int main(int argc, char ** argv) { gpt_params params; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] \n" , argv[0]); - printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); - printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); - return 1 ; + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, 
argv, params); + return 1; } - int n_kv_max = 2048; - int n_batch = 2048; - int n_ubatch = 512; - bool flash_attn = false; - int is_pp_shared = 0; - int n_gpu_layers = 0; + int is_pp_shared = params.is_pp_shared; - std::vector n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, }; - std::vector n_tg = { 128, 256, }; - std::vector n_pl = { 1, 2, 4, 8, 16, 32, }; - //std::vector n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, }; - - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - n_kv_max = std::atoi(argv[2]); - } - - if (argc >= 4) { - n_batch = std::atoi(argv[3]); - } - - if (argc >= 5) { - n_ubatch = std::atoi(argv[4]); - } - - if (argc >= 6) { - flash_attn = std::atoi(argv[5]); - } - - if (argc >= 7) { - is_pp_shared = std::atoi(argv[6]); - } - - if (argc >= 8) { - n_gpu_layers = std::atoi(argv[7]); - } - - if (argc >= 9) { - n_pp = parse_list(argv[8]); - } - - if (argc >= 10) { - n_tg = parse_list(argv[9]); - } - - if (argc >= 11) { - n_pl = parse_list(argv[10]); - } + std::vector n_pp = params.n_pp; + std::vector n_tg = params.n_tg; + std::vector n_pl = params.n_pl; // init LLM @@ -97,12 +57,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - const std::vector t_split(llama_max_devices(), 0.0f); - - model_params.n_gpu_layers = n_gpu_layers; - model_params.tensor_split = t_split.data(); + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -111,16 +66,7 @@ int main(int argc, char ** argv) { return 1; } - llama_context_params ctx_params = llama_context_default_params(); - - ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_max; - ctx_params.n_batch = n_batch; - ctx_params.n_ubatch = n_ubatch; - ctx_params.flash_attn = flash_attn; - - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); // ensure enough sequences are available ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end()); @@ -132,6 +78,8 @@ int main(int argc, char ** argv) { return 1; } + const int32_t n_kv_max = llama_n_ctx(ctx); + llama_batch batch = llama_batch_init(n_kv_max, 0, 1); // decode in batches of ctx_params.n_batch tokens @@ -175,7 +123,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); diff --git a/examples/batched/README.md b/examples/batched/README.md index 5d7303317..ed204c308 100644 --- a/examples/batched/README.md +++ b/examples/batched/README.md @@ -3,7 +3,7 @@ The example demonstrates batched generation from a given prompt ```bash -./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4 +./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4 ... diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 591bc6e57..62d9b144d 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -7,48 +7,31 @@ #include #include +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]); + LOG_TEE("\n"); +} + int main(int argc, char ** argv) { gpt_params params; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]); - return 1 ; + params.prompt = "Hello my name is"; + params.n_predict = 32; + + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } + // number of parallel batches - int n_parallel = 1; + int n_parallel = params.n_parallel; // total length of the sequences including the prompt - int n_len = 32; - - // number of layers to offload to the GPU - int n_gpu_layers = 0; - - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - params.prompt = argv[2]; - } - - if (argc >= 4) { - n_parallel = std::atoi(argv[3]); - } - - if (argc >= 5) { - n_len = std::atoi(argv[4]); - } - - if (argc >= 6) { - n_gpu_layers = std::atoi(argv[5]); - } - - if (params.prompt.empty()) { - params.prompt = "Hello my name is"; - } - - string_process_escapes(params.prompt); + int n_predict = 32; // init LLM @@ -57,9 +40,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - model_params.n_gpu_layers = n_gpu_layers; + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), 
model_params); @@ -73,18 +54,14 @@ int main(int argc, char ** argv) { std::vector tokens_list; tokens_list = ::llama_tokenize(model, params.prompt, true); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel; + const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel; // initialize the context - llama_context_params ctx_params = llama_context_default_params(); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); - ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_req; - ctx_params.n_batch = std::max(n_len, n_parallel); - ctx_params.n_seq_max = n_parallel; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + ctx_params.n_batch = std::max(n_predict, n_parallel); llama_context * ctx = llama_new_context_with_model(model, ctx_params); @@ -93,9 +70,9 @@ int main(int argc, char ** argv) { return 1; } - const int n_ctx = llama_n_ctx(ctx); + const int n_ctx = llama_n_ctx(ctx); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); + LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { @@ -156,7 +133,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - while (n_cur <= n_len) { + while (n_cur <= n_predict) { // prepare the next batch llama_batch_clear(batch); @@ -192,7 +169,7 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of generation? 
-> mark the stream as finished - if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 004399b5f..244751e00 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -63,6 +63,7 @@ int main(int argc, char ** argv) { gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -79,9 +80,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp index 51d67d6d9..64cd338c2 100644 --- a/examples/eval-callback/eval-callback.cpp +++ b/examples/eval-callback/eval-callback.cpp @@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) { } int main(int argc, char ** argv) { - callback_data cb_data; gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } print_build_info(); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh index 7ca6fa7f2..3bc0fa471 100755 --- a/examples/gguf-split/tests.sh +++ b/examples/gguf-split/tests.sh @@ -41,7 +41,7 @@ echo PASS echo # 2b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -51,7 +51,7 @@ echo PASS echo # 3b. Test the merged model is loading properly -$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32 echo PASS echo @@ -61,7 +61,7 @@ echo PASS echo # 4b. Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32 echo PASS echo @@ -71,7 +71,7 @@ echo #echo # 5b. Test the merged model is loading properly -#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32 +#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32 #echo PASS #echo @@ -81,7 +81,7 @@ echo PASS echo # 6b. 
Test the sharded model is loading properly -$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32 echo PASS echo diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 52fd719b3..213515791 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) { int main(int argc, char * argv[]) { gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 25a2351cc..e050c09d2 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -533,7 +533,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } int main(int argc, char ** argv) { - StatParams sparams; std::string prev_result_file; std::string combine_files; @@ -581,7 +580,9 @@ int main(int argc, char ** argv) { gpt_params params; params.n_batch = 512; - if (!gpt_params_parse(args.size(), args.data(), params)) { + + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -597,9 +598,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } sparams.dataset = params.prompt_file; g_collector.set_parameters(std::move(sparams)); diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 539f78184..0e4ec79c6 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -107,6 +107,7 @@ int main(int argc, char ** argv) { g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } @@ -139,27 +140,6 @@ int main(int argc, char ** argv) { LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; } - if (params.instruct) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for instruct mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (params.chatml) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for chatml mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (!params.antiprompt.empty()) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); - printf("************\n\n"); - - return 0; - } if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { printf("\n************\n"); printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); @@ -167,20 +147,6 @@ int main(int argc, char ** argv) { return 0; } - if (params.random_prompt) { - printf("\n************\n"); - printf("%s: please use the 'main' tool for random prompt mode\n", __func__); - printf("************\n\n"); - - return 0; - } - if (!params.path_prompt_cache.empty()) { - printf("\n************\n"); - printf("%s: infill does not support prompt caching\n", __func__); - printf("************\n\n"); - - return 0; - } if (params.rope_freq_base != 0.0) { LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); @@ -207,17 +173,13 @@ int main(int argc, char ** argv) { llama_model * model; 
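For reference, a minimal sketch of the startup skeleton that the converted examples share after this refactor — parse with `gpt_params_parse`, print usage on failure, then derive model and context parameters from the shared `gpt_params`. This is not a file added by the series; error handling is abbreviated, and `gpt_params_print_usage` is usually wrapped in each example's own static `print_usage` helper.

```cpp
// Illustrative sketch only: common skeleton of the converted examples.
#include "common.h"
#include "llama.h"

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params); // examples wrap this in a local print_usage()
        return 1;
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    // model/context parameters now come from the shared gpt_params
    llama_model_params   mparams = llama_model_params_from_gpt_params(params);
    llama_model        * model   = llama_load_model_from_file(params.model.c_str(), mparams);
    if (model == NULL) {
        return 1;
    }

    llama_context_params cparams = llama_context_params_from_gpt_params(params);
    llama_context      * ctx     = llama_new_context_with_model(model, cparams);

    // ... example-specific work ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();

    return 0;
}
```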
llama_context * ctx; - llama_context * ctx_guidance = NULL; + g_model = &model; g_ctx = &ctx; // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (sparams.cfg_scale > 1.f) { - struct llama_context_params lparams = llama_context_params_from_gpt_params(params); - ctx_guidance = llama_new_context_with_model(model, lparams); - } if (model == NULL) { LOG_TEE("%s: error: unable to load model\n", __func__); @@ -273,25 +235,6 @@ int main(int argc, char ** argv) { LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } - // Tokenize negative prompt - std::vector guidance_inp; - int guidance_offset = 0; - int original_prompt_len = 0; - if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true); - LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true); - LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); - - original_prompt_len = original_inp.size(); - guidance_offset = (int)guidance_inp.size() - original_prompt_len; - LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); - LOG("guidance_offset: %s", log_tostr(guidance_offset)); - } - if ((int) embd_inp.size() > n_ctx - 4) { LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); return 1; @@ -319,15 +262,6 @@ int main(int argc, char ** argv) { LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); } - if (ctx_guidance) { - LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); - LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); - for (int i = 0; i < (int) guidance_inp.size(); i++) { - LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); - } - } - if (params.n_keep > 0) { LOG_TEE("%s: static prompt based on n_keep: '", __func__); for (int i = 0; i < params.n_keep; i++) { @@ -395,12 +329,11 @@ int main(int argc, char ** argv) { is_interacting = params.interactive_first; } - bool input_echo = true; + bool input_echo = true; - int n_past = 0; - int n_remain = params.n_predict; - int n_consumed = 0; - int n_past_guidance = 0; + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; @@ -410,7 +343,6 @@ int main(int argc, char ** argv) { console::set_display(console::prompt); std::vector embd; - std::vector embd_guidance; struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); @@ -436,7 +368,7 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (n_past + (int) embd.size() > n_ctx) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; @@ -453,11 +385,7 @@ int main(int argc, char ** argv) { 
n_past -= n_discard; - if (ctx_guidance) { - n_past_guidance -= n_discard; - } - - LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + LOG("after swap: n_past = %d\n", n_past); LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); @@ -465,45 +393,6 @@ int main(int argc, char ** argv) { // evaluate tokens in batches // embd is typically prepared beforehand to fit within a batch, but not always - - if (ctx_guidance) { - int input_size = 0; - llama_token * input_buf = NULL; - - if (n_past_guidance < (int) guidance_inp.size()) { - // Guidance context should have the same data with these modifications: - // - // * Replace the initial prompt - // * Shift everything by guidance_offset - embd_guidance = guidance_inp; - if (embd.begin() + original_prompt_len < embd.end()) { - embd_guidance.insert( - embd_guidance.end(), - embd.begin() + original_prompt_len, - embd.end() - ); - } - - input_buf = embd_guidance.data(); - input_size = embd_guidance.size(); - - LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); - } else { - input_buf = embd.data(); - input_size = embd.size(); - } - - for (int i = 0; i < input_size; i += params.n_batch) { - int n_eval = std::min(input_size - i, params.n_batch); - if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { - LOG_TEE("%s : failed to eval\n", __func__); - return 1; - } - - n_past_guidance += n_eval; - } - } - for (int i = 0; i < (int) embd.size(); i += params.n_batch) { int n_eval = (int) embd.size() - i; if (n_eval > params.n_batch) { @@ -525,11 +414,9 @@ int main(int argc, char ** argv) { } embd.clear(); - embd_guidance.clear(); if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - - const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr); llama_sampling_accept(ctx_sampling, ctx, id, true); @@ -583,7 +470,6 @@ int main(int argc, char ** argv) { // if not currently processing queued inputs; if ((int) embd_inp.size() <= n_consumed) { - // deal with eot token in infill mode if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if (is_interacting && !params.interactive_first) { @@ -644,7 +530,6 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); embd_inp.push_back(llama_token_middle(model)); embd.clear(); - embd_guidance.clear(); n_remain = params.n_predict; n_past = 0; n_consumed = 0; @@ -751,7 +636,6 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); - if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); llama_free_model(model); diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index fa7ad1bdb..5c31548a6 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -41,20 +41,6 @@ static std::string join(const std::vector & values, const std::string & delim return str.str(); } -template -static std::vector split(const std::string & str, char delim) { - std::vector values; - std::istringstream str_stream(str); - std::string token; - while (std::getline(str_stream, token, delim)) { - T value; - std::istringstream token_stream(token); - token_stream >> value; - values.push_back(value); - } - return values; -} - template static std::vector transform_to_str(const 
std::vector & values, F f) { std::vector str_values; @@ -322,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.model.insert(params.model.end(), p.begin(), p.end()); } else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end()); } else if (arg == "-n" || arg == "--n-gen") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_gen.insert(params.n_gen.end(), p.begin(), p.end()); } else if (arg == "-pg") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], ','); + auto p = string_split(argv[i], ','); if (p.size() != 2) { invalid_param = true; break; @@ -354,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); } else if (arg == "-ub" || arg == "--ubatch-size") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end()); } else if (arg == "-ctk" || arg == "--cache-type-k") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -384,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector types; for (const auto & t : p) { ggml_type gt = ggml_type_from_name(t); @@ -400,14 +386,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); } else if (arg == "-rpc" || arg == "--rpc") { if (++i >= argc) { @@ -420,7 +406,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); std::vector modes; for (const auto & m : p) { llama_split_mode mode; @@ -442,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - params.main_gpu = split(argv[i], split_delim); + params.main_gpu = string_split(argv[i], split_delim); } else if (arg == "-nkvo" || arg == "--no-kv-offload") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); } else if (arg == "--numa") { if (++i >= argc) { @@ -466,28 +452,28 @@ static cmd_params 
parse_cmd_params(int argc, char ** argv) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); } else if (arg == "-embd" || arg == "--embeddings") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); + auto p = string_split(argv[i], split_delim); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) { invalid_param = true; break; } - for (auto ts : split(argv[i], split_delim)) { + for (auto ts : string_split(argv[i], split_delim)) { // split string by ; and / const std::regex regex{R"([;/]+)"}; std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1}; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index c974900f2..8c7dd2ae3 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -112,9 +112,12 @@ struct llava_context { struct llama_model * model = NULL; }; -static void show_additional_info(int /*argc*/, char ** argv) { - LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\n example usage:\n"); + LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); } static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { @@ -278,7 +281,7 @@ int main(int argc, char ** argv) { gpt_params params; if (!gpt_params_parse(argc, argv, params)) { - show_additional_info(argc, argv); + print_usage(argc, argv, params); return 1; } @@ -290,8 +293,7 @@ int main(int argc, char ** argv) { #endif // LOG_DISABLE_LOGS if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - gpt_params_print_usage(argc, argv, params); - show_additional_info(argc, argv); + print_usage(argc, argv, {}); return 1; } auto model = llava_init(¶ms); diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index 54f060a85..fb20ad93f 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -37,7 +37,8 @@ struct ngram_container { int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp index 1c230c966..d713f6f21 100644 --- a/examples/lookup/lookup-create.cpp +++ b/examples/lookup/lookup-create.cpp @@ -14,8 +14,10 @@ int main(int argc, char ** argv){ gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } + // init llama.cpp llama_backend_init(); llama_numa_init(params.numa); diff --git 
a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp index 87ecc0a4f..0b171c872 100644 --- a/examples/lookup/lookup-stats.cpp +++ b/examples/lookup/lookup-stats.cpp @@ -16,6 +16,7 @@ int main(int argc, char ** argv){ gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 83dbee91a..80ecd925d 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -15,6 +15,7 @@ int main(int argc, char ** argv){ gpt_params params; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/main/README.md b/examples/main/README.md index ee930f4e7..4eaa68475 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can #### Unix-based systems (Linux, macOS, etc.): ```bash -./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt +./main -m models/7B/ggml-model.bin --ignore-eos -n -1 ``` #### Windows: ```powershell -main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt +main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 ``` ## Common Options @@ -80,7 +80,6 @@ The `main` program provides several ways to interact with the LLaMA models using - `--prompt PROMPT`: Provide a prompt directly as a command-line option. - `--file FNAME`: Provide a file containing a prompt or multiple prompts. - `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.) -- `--random-prompt`: Start with a randomized prompt. ## Interaction diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 44949ba86..b97b7b793 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -122,8 +122,10 @@ int main(int argc, char ** argv) { g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } + llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS @@ -180,9 +182,6 @@ int main(int argc, char ** argv) { LOG_TEE("%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } LOG("%s: llama backend init\n", __func__); llama_backend_init(); @@ -250,11 +249,8 @@ int main(int argc, char ** argv) { std::vector embd_inp; - if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { + if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); - if (params.chatml) { - params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; - } embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); } else { LOG("use session tokens\n"); @@ -332,37 +328,13 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { params.n_keep = (int)embd_inp.size(); } else { params.n_keep += add_bos; // always keep the BOS token } - // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); - - 
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); - LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); - - // chatml prefix & suffix - const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true); - const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true); - - LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str()); - LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str()); - - // in instruct mode, we inject a prefix and a suffix to each input by the user - if (params.instruct) { - params.interactive_first = true; - params.antiprompt.emplace_back("### Instruction:\n\n"); - } - // similar for chatml mode - else if (params.chatml) { - params.interactive_first = true; - params.antiprompt.emplace_back("<|im_start|>user\n"); - } - else if (params.conversation) { + if (params.conversation) { params.interactive_first = true; } @@ -823,15 +795,13 @@ int main(int argc, char ** argv) { is_interacting = true; printf("\n"); - } else if (params.instruct || params.chatml) { - is_interacting = true; } } if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); - if (params.conversation || params.instruct || params.chatml) { + if (params.conversation) { printf("\n> "); } @@ -874,24 +844,12 @@ int main(int argc, char ** argv) { const size_t original_size = embd_inp.size(); - // instruct mode: insert instruction prefix - if (params.instruct && !is_antiprompt) { - LOG("inserting instruction prefix\n"); - n_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); - } - // chatml mode: insert user chat prefix - if (params.chatml && !is_antiprompt) { - LOG("inserting chatml prefix\n"); - n_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); - } if (params.escape) { string_process_escapes(buffer); } const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); @@ -900,17 +858,6 @@ int main(int argc, char ** argv) { embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); - // instruct mode: insert response suffix - if (params.instruct) { - LOG("inserting instruction suffix\n"); - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - } - // chatml mode: insert assistant chat suffix - if (params.chatml) { - LOG("inserting chatml suffix\n"); - embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end()); - } - for (size_t i = original_size; i < embd_inp.size(); ++i) { const llama_token token = embd_inp[i]; output_tokens.push_back(token); @@ -935,7 +882,7 @@ int main(int argc, char ** argv) { } // end of generation - if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) { + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) { LOG_TEE(" [end of text]\n"); break; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index c731abb72..7faeaec97 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -100,7 +100,8 @@ int main(int argc, char ** 
argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/passkey/README.md b/examples/passkey/README.md index 4a22bb559..9e7a119ba 100644 --- a/examples/passkey/README.md +++ b/examples/passkey/README.md @@ -8,5 +8,5 @@ See the following PRs for more info: ### Usage ```bash -make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250 +make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250 ``` diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index f2ef9ca10..d03215cd1 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -6,46 +6,32 @@ #include #include +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]); + LOG_TEE("\n"); +} + int main(int argc, char ** argv) { gpt_params params; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]); - return 1 ; + params.n_junk = 250; + params.n_keep = 32; + params.i_pos = -1; + + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } - int seed = -1; + srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed); - int n_junk = 250; // number of times to repeat the junk text - int n_keep = 32; // number of tokens in the prompt prefix - int n_grp = 1; // if more than 1 - perform LongLM SelfExtend - int i_pos = -1; // position of the passkey in the junk text - - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - n_junk = std::stoi(argv[2]); - } - - if (argc >= 4) { - n_grp = std::stoi(argv[3]); - } - - if (argc >= 5) { - i_pos = std::stoi(argv[4]); - } - - if (argc >= 6) { - seed = std::stoi(argv[5]); - } - - if (seed == -1) { - seed = time(NULL); - } - - srand(seed); + int n_junk = params.n_junk; + int n_keep = params.n_keep; + int n_grp = params.grp_attn_n; + int i_pos = params.i_pos; if (i_pos == -1) { i_pos = rand() % n_junk; @@ -76,9 +62,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - model_params.n_gpu_layers = 99; // offload all layers to the GPU + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -89,13 +73,9 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = llama_context_default_params(); + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); - ctx_params.seed = seed; - ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; - ctx_params.n_batch = 512; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; + ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep; GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp"); @@ -135,7 +115,7 @@ int main(int argc, char ** argv) { LOG_TEE("prompt tokens: %d\n", n_tokens_all); //LOG_TEE("prompt: %s\n", params.prompt.c_str()); - llama_batch batch = llama_batch_init(512, 0, 1); + llama_batch batch = llama_batch_init(params.n_batch, 0, 1); int n_past = 0; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 30e5e282e..0bd78c21a 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1032,7 +1032,7 @@ struct winogrande_entry { std::vector seq_tokens[2]; }; -static std::vector load_winogrande_from_csv(const std::string& prompt) { +static std::vector load_winogrande_from_csv(const std::string & prompt) { std::vector result; std::istringstream in(prompt); std::string line; @@ -1964,12 +1964,14 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { int main(int argc, char ** argv) { gpt_params params; + params.n_ctx = 512; + params.logits_all = true; + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } - params.logits_all = true; - const int32_t n_ctx = params.n_ctx; if (n_ctx <= 0) { @@ -2006,9 +2008,6 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = string_random_prompt(rng); - } llama_backend_init(); llama_numa_init(params.numa); @@ -2027,6 +2026,7 @@ int main(int argc, char ** argv) { } const int n_ctx_train = llama_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, params.n_ctx); diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh index a3ca74c68..38e28ffc3 100644 --- a/examples/quantize/tests.sh +++ b/examples/quantize/tests.sh @@ -47,7 +47,7 @@ echo PASS echo # 3a. Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --n-predict 32 echo PASS echo @@ -57,7 +57,7 @@ echo PASS echo # 4b. 
Test the requanted model is loading properly -$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32 +$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --n-predict 32 echo PASS echo diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 4e7530706..55b7b2f70 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -4,72 +4,12 @@ #include #include -struct retrieval_params { - std::vector context_files; // context files to embed - int32_t chunk_size = 64; // chunk size for context embedding - std::string chunk_separator = "\n"; // chunk separator for context embedding -}; +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); -static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { - gpt_params_print_usage(argc, argv, gpt_params); - printf("retrieval options:\n"); - printf(" --context-file FNAME file containing context to embed.\n"); - printf(" specify multiple files by providing --context-file option multiple times.\n"); - printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size); - printf(" --chunk-separator STRING\n"); - printf(" string to separate chunks (default: \"\\n\")\n"); - printf("\n"); -} - -static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) { - int i = 1; - std::string arg; - while (i < argc) { - arg = argv[i]; - bool invalid_gpt_param = false; - if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) { - if (invalid_gpt_param) { - fprintf(stderr, "error: invalid argument: %s\n", arg.c_str()); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - // option was parsed by gpt_params_find_arg - } else if (arg == "--context-file") { - if (++i >= argc) { - fprintf(stderr, "error: missing argument for --context-file\n"); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - // store the external file name in params - retrieval_params.context_files.push_back(argv[i]); - } else if (arg == "--chunk-size") { - if (++i >= argc) { - fprintf(stderr, "error: missing argument for --chunk-size\n"); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - retrieval_params.chunk_size = std::stoi(argv[i]); - } else if (arg == "--chunk-separator") { - if (++i >= argc) { - fprintf(stderr, "error: missing argument for --chunk-separator\n"); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - retrieval_params.chunk_separator = argv[i]; - } else { - // unknown argument - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); - exit(1); - } - i++; - } + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator .\n", argv[0]); + LOG_TEE("\n"); } struct chunk { @@ -171,33 +111,35 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu int main(int argc, char ** argv) { 
gpt_params params; - retrieval_params retrieval_params; - retrieval_params_parse(argc, argv, params, retrieval_params); + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; + } // For BERT models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; + params.embedding = true; - if (retrieval_params.chunk_size <= 0) { + if (params.chunk_size <= 0) { fprintf(stderr, "chunk_size must be positive\n"); return 1; } - if (retrieval_params.context_files.empty()) { + if (params.context_files.empty()) { fprintf(stderr, "context_files must be specified\n"); return 1; } - params.embedding = true; print_build_info(); printf("processing files:\n"); - for (auto & context_file : retrieval_params.context_files) { + for (auto & context_file : params.context_files) { printf("%s\n", context_file.c_str()); } std::vector chunks; - for (auto & context_file : retrieval_params.context_files) { - std::vector file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator); + for (auto & context_file : params.context_files) { + std::vector file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator); chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); } printf("Number of chunks: %ld\n", chunks.size()); @@ -242,7 +184,7 @@ int main(int argc, char ** argv) { return 1; } // add eos if not present - if (inp.empty() || inp.back() != llama_token_eos(model)) { + if (llama_token_eos(model) >= 0 && (inp.empty() || inp.back() != llama_token_eos(model))) { inp.push_back(llama_token_eos(model)); } chunk.tokens = inp; diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index c3b766882..00c2277ac 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -11,6 +11,7 @@ int main(int argc, char ** argv) { params.prompt = "The quick brown fox"; if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index fc6d90848..d581cad95 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -123,29 +123,6 @@ struct slot_params { json input_suffix; }; -struct server_params { - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; - int32_t n_threads_http = -1; - - std::string hostname = "127.0.0.1"; - std::string public_path = ""; - std::string chat_template = ""; - std::string system_prompt = ""; - - std::vector api_keys; - -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - std::string ssl_key_file = ""; - std::string ssl_cert_file = ""; -#endif - - bool slots_endpoint = true; - bool metrics_endpoint = false; - std::string slot_save_path; -}; - struct server_slot { int id; int id_task = -1; @@ -1261,7 +1238,7 @@ struct server_context { } json get_formated_generation(const server_slot & slot) const { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); std::vector samplers_sequence; @@ -2334,561 +2311,6 @@ struct server_context { } }; -static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" 
-h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); - printf(" --threads-http N number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n"); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); - printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); - printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); - printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); - printf(" --pooling {none,mean,cls} pooling type for embeddings, use model default if unspecified\n"); - printf(" -dt N, --defrag-thold N\n"); - printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); - printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch); - printf(" -ub N, --ubatch-size N physical maximum batch size (default: %d)\n", params.n_ubatch); - if (llama_supports_mlock()) { - printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); - } - if (llama_supports_mmap()) { - printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); - } - printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n"); - printf(" - distribute: spread execution evenly over all nodes\n"); - printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n"); - printf(" - numactl: use the CPU map provided my numactl\n"); - if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf(" how to split the model across multiple GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf(" - layer (default): split layers and KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 
3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); - printf(" or for intermediate results and KV (with split-mode = row)\n"); - printf(" -nkvo, --no-kv-offload\n"); - printf(" disable KV offload\n"); - } - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); - printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: unused)\n"); - printf(" -hfr REPO, --hf-repo REPO\n"); - printf(" Hugging Face model repository (default: unused)\n"); - printf(" -hff FILE, --hf-file FILE\n"); - printf(" Hugging Face model file (default: unused)\n"); - printf(" -a ALIAS, --alias ALIAS\n"); - printf(" set an alias for the model, will be added as `model` field in completion response\n"); - printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); - printf(" --rpc SERVERS comma separated list of RPC servers\n"); - printf(" --path PUBLIC_PATH path from which to serve static files (default: disabled)\n"); - printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); - printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - printf(" --ssl-key-file FNAME path to file a PEM-encoded SSL private key\n"); - printf(" --ssl-cert-file FNAME path to file a PEM-encoded SSL certificate\n"); -#endif - printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); - printf(" --embeddings enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); - printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); - printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: enabled)\n"); - printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n"); - printf(" -ctk TYPE, --cache-type-k TYPE\n"); - printf(" KV cache data type for K (default: f16)\n"); - printf(" -ctv TYPE, --cache-type-v TYPE\n"); - printf(" KV cache data type for V (default: f16)\n"); - printf(" --log-format log output format: json or text (default: json)\n"); - printf(" --log-disable disables logging to a file.\n"); - printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); - printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); - printf(" --slot-save-path PATH path to save slot kv cache (default: disabled)\n"); - printf("\n"); - printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool, str. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); - printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); - printf(" --chat-template JINJA_TEMPLATE\n"); - printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); - printf(" only commonly used templates are accepted:\n"); - printf(" https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template\n"); - printf("\n"); -} - -static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params) { - gpt_params default_params; - server_params default_sparams; - - std::string arg; - bool invalid_param = false; - - for (int i = 1; i < argc; i++) { - arg = argv[i]; - if (arg == "--port") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.port = std::stoi(argv[i]); - } else if (arg == "--rpc") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rpc_servers = argv[i]; - } else if (arg == "--host") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.hostname = argv[i]; - } else if (arg == "--path") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.public_path = argv[i]; - } else if (arg == "--api-key") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.api_keys.push_back(argv[i]); - } else if (arg == "--api-key-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream key_file(argv[i]); - if (!key_file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string key; - while (std::getline(key_file, key)) { - if (key.size() > 0) { - sparams.api_keys.push_back(key); - } - } - key_file.close(); - - } -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - else if (arg == "--ssl-key-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.ssl_key_file = argv[i]; - } else if (arg == "--ssl-cert-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.ssl_cert_file = argv[i]; - } -#endif - else if (arg == "--timeout" || arg == "-to") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.read_timeout = std::stoi(argv[i]); - sparams.write_timeout = std::stoi(argv[i]); - } else if (arg == "-m" || arg == "--model") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } else if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_url = argv[i]; - } else if (arg == "-hfr" || arg == "--hf-repo") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.hf_repo = argv[i]; - } else if (arg == "-hff" || arg == "--hf-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.hf_file = argv[i]; - } else if (arg == "-a" || arg == "--alias") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } else if (arg == "-h" || arg == "--help") { - server_print_usage(argv[0], default_params, default_sparams); - exit(0); - } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); 
- } else if (arg == "--rope-scaling") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; break; } - } else if (arg == "--rope-freq-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); - } else if (arg == "--rope-freq-scale") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); - } else if (arg == "--yarn-ext-factor") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_ext_factor = std::stof(argv[i]); - } - else if (arg == "--yarn-attn-factor") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_attn_factor = std::stof(argv[i]); - } else if (arg == "--yarn-beta-fast") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_fast = std::stof(argv[i]); - } else if (arg == "--yarn-beta-slow") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_slow = std::stof(argv[i]); - } else if (arg == "--pooling") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else { invalid_param = true; break; } - } else if (arg == "--defrag-thold" || arg == "-dt") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.defrag_thold = std::stof(argv[i]); - } else if (arg == "--threads" || arg == "-t") { - if (++i >= argc) - { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - } else if (arg == "--grp-attn-n" || arg == "-gan") { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); - } else if (arg == "--grp-attn-w" || arg == "-gaw") { - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); - } else if (arg == "--threads-batch" || arg == "-tb") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - } else if (arg == "--threads-http") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.n_threads_http = std::stoi(argv[i]); - } else if (arg == "-b" || arg == "--batch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - } else if (arg == "-ub" || arg == "--ubatch-size") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ubatch = std::stoi(argv[i]); - } else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - if (++i >= argc) { - invalid_param = true; - break; - } - if (llama_supports_gpu_offload()) { - params.n_gpu_layers = std::stoi(argv[i]); - } else { - LOG_WARNING( - "Not compiled with GPU offload support, --n-gpu-layers option will be ignored. 
" - "See main README.md for information on enabling GPU BLAS support", - {{"n_gpu_layers", params.n_gpu_layers}}); - } - } else if (arg == "-nkvo" || arg == "--no-kv-offload") { - params.no_kv_offload = true; - } else if (arg == "--split-mode" || arg == "-sm") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } else { - invalid_param = true; - break; - } -#ifndef GGML_USE_CUDA - fprintf(stderr, "warning: llama.cpp was compiled without CUDA. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUDA - } else if (arg == "--tensor-split" || arg == "-ts") { - if (++i >= argc) { - invalid_param = true; - break; - } -#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL) - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= llama_max_devices()); - - for (size_t i_device = 0; i_device < llama_max_devices(); ++i_device) { - if (i_device < split_arg.size()) { - params.tensor_split[i_device] = std::stof(split_arg[i_device]); - } else { - params.tensor_split[i_device] = 0.0f; - } - } -#else - LOG_WARNING("llama.cpp was compiled without CUDA. It is not possible to set a tensor split.\n", {}); -#endif // GGML_USE_CUDA - } else if (arg == "--main-gpu" || arg == "-mg") { - if (++i >= argc) { - invalid_param = true; - break; - } -#if defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL) - params.main_gpu = std::stoi(argv[i]); -#else - LOG_WARNING("llama.cpp was compiled without CUDA. 
It is not possible to set a main GPU.", {}); -#endif - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; - } else if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - const char * lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { -#if SERVER_VERBOSE != 1 - LOG_WARNING("server.cpp is not built with verbose logging.", {}); -#else - server_verbose = true; -#endif - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else if (arg == "--numa") { - if (++i >= argc) { - invalid_param = true; - break; - } else { - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } - } - } else if (arg == "--embedding" || arg == "--embeddings") { - params.embedding = true; - } else if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - } else if (arg == "-fa" || arg == "--flash-attn") { - params.flash_attn = true; - } else if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } else if (arg == "-spf" || arg == "--system-prompt-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string system_prompt; - std::copy( - std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(system_prompt) - ); - sparams.system_prompt = system_prompt; - } else if (arg == "-ctk" || arg == "--cache-type-k") { - params.cache_type_k = argv[++i]; - } else if (arg == "-ctv" || arg == "--cache-type-v") { - params.cache_type_v = argv[++i]; - } else if (arg == "--log-format") { - if (++i >= argc) { - invalid_param = true; - break; - } - if (std::strcmp(argv[i], "json") == 0) { - server_log_json = true; - } else if (std::strcmp(argv[i], "text") == 0) { - server_log_json = false; - } else { - invalid_param = true; - break; - } - } else if (arg == "--log-disable") { - log_set_target(stdout); - LOG_INFO("logging to file is disabled.", {}); - } else if (arg == "--slots-endpoint-disable") { - sparams.slots_endpoint = false; - } else if (arg == "--metrics") { - sparams.metrics_endpoint = true; - } else if (arg == "--slot-save-path") { - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.slot_save_path = argv[i]; - // if doesn't end with DIRECTORY_SEPARATOR, add it - if (!sparams.slot_save_path.empty() && sparams.slot_save_path[sparams.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) { - sparams.slot_save_path += DIRECTORY_SEPARATOR; - } - } else if (arg == "--chat-template") { - if (++i >= argc) { 
- invalid_param = true; - break; - } - if (!verify_custom_template(argv[i])) { - fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]); - fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n"); - invalid_param = true; - break; - } - sparams.chat_template = argv[i]; - } else if (arg == "--override-kv") { - if (++i >= argc) { - invalid_param = true; - break; - } - if (!string_parse_kv_override(argv[i], params.kv_overrides)) { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } - } - - gpt_params_handle_model_default(params); - - if (!params.kv_overrides.empty()) { - params.kv_overrides.emplace_back(); - params.kv_overrides.back().key[0] = 0; - } - - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } -} - static void log_server_request(const httplib::Request & req, const httplib::Response & res) { // skip GH copilot requests when using default port if (req.path == "/v1/health" || req.path == "/v1/completions") { @@ -2929,16 +2351,22 @@ int main(int argc, char ** argv) { log_disable(); #endif // own arguments required by this example - gpt_params params; - server_params sparams; + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); + return 1; + } + + // TODO: not great to use extern vars + server_log_json = params.log_json; + server_verbose = params.verbose; // struct that contains llama context and inference server_context ctx_server; - server_params_parse(argc, argv, sparams, params); - - if (!sparams.system_prompt.empty()) { - ctx_server.system_prompt_set(sparams.system_prompt); + if (!params.system_prompt.empty()) { + ctx_server.system_prompt_set(params.system_prompt); } if (params.model_alias == "unknown") { @@ -2962,10 +2390,10 @@ int main(int argc, char ** argv) { std::unique_ptr svr; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (sparams.ssl_key_file != "" && sparams.ssl_cert_file != "") { - LOG_INFO("Running with SSL", {{"key", sparams.ssl_key_file}, {"cert", sparams.ssl_cert_file}}); + if (params.ssl_file_key != "" && params.ssl_file_cert != "") { + LOG_INFO("Running with SSL", {{"key", params.ssl_file_key}, {"cert", params.ssl_file_cert}}); svr.reset( - new httplib::SSLServer(sparams.ssl_cert_file.c_str(), sparams.ssl_key_file.c_str()) + new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) ); } else { LOG_INFO("Running without SSL", {}); @@ -3019,24 +2447,24 @@ int main(int argc, char ** argv) { }); // set timeouts and change hostname and port - svr->set_read_timeout (sparams.read_timeout); - svr->set_write_timeout(sparams.write_timeout); + svr->set_read_timeout (params.timeout_read); + svr->set_write_timeout(params.timeout_write); - if (!svr->bind_to_port(sparams.hostname, sparams.port)) { - fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); + if (!svr->bind_to_port(params.hostname, params.port)) { + fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", params.hostname.c_str(), params.port); return 1; } std::unordered_map log_data; - log_data["hostname"] = sparams.hostname; - log_data["port"] = 
std::to_string(sparams.port); + log_data["hostname"] = params.hostname; + log_data["port"] = std::to_string(params.port); - if (sparams.api_keys.size() == 1) { - auto key = sparams.api_keys[0]; + if (params.api_keys.size() == 1) { + auto key = params.api_keys[0]; log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (sparams.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; + } else if (params.api_keys.size() > 1) { + log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; } // load the model @@ -3053,10 +2481,10 @@ int main(int argc, char ** argv) { const auto model_meta = ctx_server.model_meta(); // if a custom chat template is not supplied, we will use the one that comes with the model (if any) - if (sparams.chat_template.empty()) { + if (params.chat_template.empty()) { if (!ctx_server.validate_model_chat_template()) { LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses", {}); - sparams.chat_template = "chatml"; + params.chat_template = "chatml"; } } @@ -3068,11 +2496,11 @@ int main(int argc, char ** argv) { chat.push_back({{"role", "assistant"}, {"content", "Hi there"}}); chat.push_back({{"role", "user"}, {"content", "How are you?"}}); - const std::string chat_example = format_chat(ctx_server.model, sparams.chat_template, chat); + const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat); LOG_INFO("chat template", { {"chat_example", chat_example}, - {"built_in", sparams.chat_template.empty()}, + {"built_in", params.chat_template.empty()}, }); } @@ -3080,7 +2508,7 @@ int main(int argc, char ** argv) { // Middlewares // - auto middleware_validate_api_key = [&sparams, &res_error](const httplib::Request & req, httplib::Response & res) { + auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) { // TODO: should we apply API key to all endpoints, including "/health" and "/models"? 
static const std::set protected_endpoints = { "/props", @@ -3098,7 +2526,7 @@ int main(int argc, char ** argv) { }; // If API key is not set, skip validation - if (sparams.api_keys.empty()) { + if (params.api_keys.empty()) { return true; } @@ -3113,7 +2541,7 @@ int main(int argc, char ** argv) { std::string prefix = "Bearer "; if (auth_header.substr(0, prefix.size()) == prefix) { std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) { + if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { return true; // API key is valid } } @@ -3168,7 +2596,7 @@ int main(int argc, char ** argv) { }; res.status = 200; // HTTP OK - if (sparams.slots_endpoint && req.has_param("include_slots")) { + if (params.endpoint_slots && req.has_param("include_slots")) { health["slots"] = result.data.at("slots"); } @@ -3194,7 +2622,7 @@ int main(int argc, char ** argv) { }; const auto handle_slots = [&](const httplib::Request &, httplib::Response & res) { - if (!sparams.slots_endpoint) { + if (!params.endpoint_slots) { res_error(res, format_error_response("This server does not support slots endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3218,7 +2646,7 @@ int main(int argc, char ** argv) { }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!sparams.metrics_endpoint) { + if (!params.endpoint_metrics) { res_error(res, format_error_response("This server does not support metrics endpoint.", ERROR_TYPE_NOT_SUPPORTED)); return; } @@ -3318,14 +2746,14 @@ int main(int argc, char ** argv) { res.status = 200; // HTTP OK }; - const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { + const auto handle_slots_save = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); return; } - std::string filepath = sparams.slot_save_path + filename; + std::string filepath = params.slot_save_path + filename; server_task task; task.type = SERVER_TASK_TYPE_SLOT_SAVE; @@ -3348,14 +2776,14 @@ int main(int argc, char ** argv) { } }; - const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) { + const auto handle_slots_restore = [&ctx_server, &res_error, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); return; } - std::string filepath = sparams.slot_save_path + filename; + std::string filepath = params.slot_save_path + filename; server_task task; task.type = SERVER_TASK_TYPE_SLOT_RESTORE; @@ -3530,9 +2958,9 @@ int main(int argc, char ** argv) { res.set_content(models.dump(), "application/json; charset=utf-8"); }; - const auto handle_chat_completions = [&ctx_server, &sparams, &res_error](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error](const httplib::Request & req, httplib::Response & res) 
{ res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), sparams.chat_template); + json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); const int id_task = ctx_server.queue_tasks.get_new_id(); @@ -3757,29 +3185,29 @@ int main(int argc, char ** argv) { // // register static assets routes - if (!sparams.public_path.empty()) { + if (!params.public_path.empty()) { // Set the base directory for serving static files - svr->set_base_dir(sparams.public_path); + svr->set_base_dir(params.public_path); } + // using embedded static files - svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); - svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); - svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); - svr->Get("/json-schema-to-grammar.mjs", handle_static_file( - json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); + svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8")); + svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8")); + svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8")); + svr->Get("/json-schema-to-grammar.mjs", handle_static_file(json_schema_to_grammar_mjs, json_schema_to_grammar_mjs_len, "text/javascript; charset=utf-8")); // add new-ui files - svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); - svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); + svr->Get("/colorthemes.css", handle_static_file(colorthemes_css, colorthemes_css_len, "text/css; charset=utf-8")); + svr->Get("/style.css", handle_static_file(style_css, style_css_len, "text/css; charset=utf-8")); svr->Get("/theme-beeninorder.css", handle_static_file(theme_beeninorder_css, theme_beeninorder_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-playground.css", handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); - svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); - svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); - svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); - svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); + svr->Get("/theme-ketivah.css", handle_static_file(theme_ketivah_css, theme_ketivah_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-mangotango.css", handle_static_file(theme_mangotango_css, theme_mangotango_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-playground.css", 
handle_static_file(theme_playground_css, theme_playground_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-polarnight.css", handle_static_file(theme_polarnight_css, theme_polarnight_css_len, "text/css; charset=utf-8")); + svr->Get("/theme-snowstorm.css", handle_static_file(theme_snowstorm_css, theme_snowstorm_css_len, "text/css; charset=utf-8")); + svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8")); + svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8")); + svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8")); // register API routes svr->Get ("/health", handle_health); @@ -3798,7 +3226,7 @@ int main(int argc, char ** argv) { svr->Post("/v1/embeddings", handle_embeddings); svr->Post("/tokenize", handle_tokenize); svr->Post("/detokenize", handle_detokenize); - if (!sparams.slot_save_path.empty()) { + if (!params.slot_save_path.empty()) { // only enable slot endpoints if slot_save_path is set svr->Post("/slots/:id_slot", handle_slots_action); } @@ -3806,12 +3234,12 @@ int main(int argc, char ** argv) { // // Start the server // - if (sparams.n_threads_http < 1) { + if (params.n_threads_http < 1) { // +2 threads for monitoring endpoints - sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); + params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); } - log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); - svr->new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + log_data["n_threads_http"] = std::to_string(params.n_threads_http); + svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; LOG_INFO("HTTP server listening", log_data); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d8a2286e4..b7bfb41d3 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -116,13 +116,6 @@ static inline void server_log(const char * level, const char * function, int lin // chat template utils // -// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid -inline bool verify_custom_template(const std::string & tmpl) { - llama_chat_message chat[] = {{"user", "test"}}; - int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0); - return res >= 0; -} - // Format given chat. If tmpl is empty, we take the template from model metadata inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { size_t alloc_size = 0; diff --git a/examples/simple/README.md b/examples/simple/README.md index 5d24b1046..49e24501c 100644 --- a/examples/simple/README.md +++ b/examples/simple/README.md @@ -3,7 +3,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for generating text with a given prompt. ```bash -./simple ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" +./simple -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" ... 
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index b0f8e0fdc..69a92cf7d 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,28 +6,27 @@ #include #include +static void print_usage(int argc, char ** argv, const gpt_params & params) { + gpt_params_print_usage(argc, argv, params); + + LOG_TEE("\nexample usage:\n"); + LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]); + LOG_TEE("\n"); +} + int main(int argc, char ** argv) { gpt_params params; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); - return 1 ; - } + params.prompt = "Hello my name is"; + params.n_predict = 32; - if (argc >= 2) { - params.model = argv[1]; - } - - if (argc >= 3) { - params.prompt = argv[2]; - } - - if (params.prompt.empty()) { - params.prompt = "Hello my name is"; + if (!gpt_params_parse(argc, argv, params)) { + print_usage(argc, argv, params); + return 1; } // total length of the sequence including the prompt - const int n_len = 32; + const int n_predict = params.n_predict; // init LLM @@ -36,9 +35,7 @@ int main(int argc, char ** argv) { // initialize the model - llama_model_params model_params = llama_model_default_params(); - - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + llama_model_params model_params = llama_model_params_from_gpt_params(params); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -49,12 +46,7 @@ int main(int argc, char ** argv) { // initialize the context - llama_context_params ctx_params = llama_context_default_params(); - - ctx_params.seed = 1234; - ctx_params.n_ctx = 2048; - ctx_params.n_threads = params.n_threads; - ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + llama_context_params ctx_params = llama_context_params_from_gpt_params(params); llama_context * ctx = llama_new_context_with_model(model, ctx_params); @@ -69,14 +61,14 @@ int main(int argc, char ** argv) { tokens_list = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); - const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size()); + const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size()); - LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req); + LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req); // make sure the KV cache is big enough to hold all the prompt and generated tokens if (n_kv_req > n_ctx) { LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__); - LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__); + LOG_TEE("%s: either reduce n_predict or increase n_ctx\n", __func__); return 1; } @@ -115,7 +107,7 @@ int main(int argc, char ** argv) { const auto t_main_start = ggml_time_us(); - while (n_cur <= n_len) { + while (n_cur <= n_predict) { // sample the next token { auto n_vocab = llama_n_vocab(model); @@ -134,7 +126,7 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of generation? 
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) { LOG_TEE("\n"); break; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 12e46fbc9..0939a1a6a 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -27,7 +27,8 @@ struct seq_draft { int main(int argc, char ** argv) { gpt_params params; - if (gpt_params_parse(argc, argv, params) == false) { + if (!gpt_params_parse(argc, argv, params)) { + gpt_params_print_usage(argc, argv, params); return 1; } diff --git a/llama.cpp b/llama.cpp index c05e2bdb7..06889126e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -108,7 +108,7 @@ // LLAMA_ATTRIBUTE_FORMAT(2, 3) -static void llama_log_internal (ggml_log_level level, const char* format, ...); +static void llama_log_internal (ggml_log_level level, const char * format, ...); static void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data); #define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__) diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index e986a3604..0d7219113 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -13,12 +13,12 @@ logger = logging.getLogger("run-with-preset") CLI_ARGS_MAIN_PERPLEXITY = [ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", - "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", + "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", - "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", + "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "repeat-last-n", "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed", "simple-io", "tensor-split", "threads", "temp", "tfs", "top-k", "top-p", "typical", "verbose-prompt" From b90dc566c1c615289b05b50d61680f23744a21e7 Mon Sep 17 00:00:00 2001 From: agray3 Date: Tue, 4 Jun 2024 21:06:49 +0100 Subject: [PATCH 08/39] Allow number of nodes in CUDA graph to change (#7738) Previously the code would have failed to cope in the case that the number of nodes changes in an existing CUDA graph. This fixes the issue by removing an unnecessary conditional. 
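A minimal standalone sketch of the two-step query pattern this change relies on, assuming an already-instantiated cudaGraph_t; the helper name and the CHECK macro below are placeholders rather than code from ggml-cuda.cu. The first cudaGraphGetNodes call with a null node array returns only the current node count, so a graph whose number of nodes has changed since the last update is handled correctly; the second call then fills the array.

    #include <cuda_runtime.h>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    // Placeholder error check; ggml-cuda.cu uses its own CUDA_CHECK macro.
    #define CHECK(call) do { cudaError_t err_ = (call); if (err_ != cudaSuccess) { \
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_)); exit(1); } } while (0)

    // Re-extract the node handles of a (possibly re-captured) CUDA graph.
    static void extract_graph_nodes(cudaGraph_t graph, std::vector<cudaGraphNode_t> & nodes) {
        size_t num_nodes = 0;
        // First call with a null array gets the current number of nodes in the graph.
        CHECK(cudaGraphGetNodes(graph, nullptr, &num_nodes));
        nodes.resize(num_nodes);
        // Subsequent call with a non-null array gets the nodes themselves.
        CHECK(cudaGraphGetNodes(graph, nodes.data(), &num_nodes));
    }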
--- ggml-cuda.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index daaa0cd6a..c81c6a0d7 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2702,10 +2702,8 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t if (cuda_graph_update_required) { // Extract nodes from graph - if (cuda_ctx->cuda_graph->num_nodes == 0) { - // First call with null argument gets number of nodes in graph - CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); - } + // First call with null argument gets number of nodes in graph + CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); // Subsequent call with non-null argument gets nodes cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); From c90dbe026b456a233f8f0fbe752212e6a0503ca2 Mon Sep 17 00:00:00 2001 From: jaime-m-p <167997752+jaime-m-p@users.noreply.github.com> Date: Wed, 5 Jun 2024 01:26:14 +0200 Subject: [PATCH 09/39] Fix per token atrributes bits (#7749) --- llama.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llama.h b/llama.h index b2a302dad..9dcd67bef 100644 --- a/llama.h +++ b/llama.h @@ -109,16 +109,16 @@ extern "C" { enum llama_token_attr { LLAMA_TOKEN_ATTR_UNDEFINED = 0, - LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 1, - LLAMA_TOKEN_ATTR_UNUSED = 1 << 2, - LLAMA_TOKEN_ATTR_NORMAL = 1 << 3, - LLAMA_TOKEN_ATTR_CONTROL = 1 << 4, // SPECIAL? - LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 5, - LLAMA_TOKEN_ATTR_BYTE = 1 << 6, - LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 7, - LLAMA_TOKEN_ATTR_LSTRIP = 1 << 8, - LLAMA_TOKEN_ATTR_RSTRIP = 1 << 9, - LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 10, + LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0, + LLAMA_TOKEN_ATTR_UNUSED = 1 << 1, + LLAMA_TOKEN_ATTR_NORMAL = 1 << 2, + LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL? + LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4, + LLAMA_TOKEN_ATTR_BYTE = 1 << 5, + LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6, + LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7, + LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8, + LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9, }; // model file types From 9973e81c5ccf4f31b3980f5aa73f5cfea8699860 Mon Sep 17 00:00:00 2001 From: arch-btw <57669023+arch-btw@users.noreply.github.com> Date: Tue, 4 Jun 2024 23:40:49 -0700 Subject: [PATCH 10/39] readme : remove -ins (#7759) -ins and --instruct were moved in https://github.com/ggerganov/llama.cpp/pull/7675 I have adjusted the README accordingly. There was no trace of --chatml in the README. --- examples/main/README.md | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/examples/main/README.md b/examples/main/README.md index 4eaa68475..cdc002f15 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -69,7 +69,6 @@ In this section, we cover the most commonly used options for running the `main` - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. -- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. 
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. @@ -83,7 +82,7 @@ The `main` program provides several ways to interact with the LLaMA models using ## Interaction -The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`. +The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`. In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing. @@ -91,7 +90,6 @@ In interactive mode, users can participate in text generation by injecting their - `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model. - `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation. -- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions. - `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text. By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. @@ -120,16 +118,6 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful ./main -r "User:" --in-prefix " " --in-suffix "Assistant:" ``` -### Instruction Mode - -Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks: - -- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions. - -Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response). - -By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs. - ## Context Management During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. 
When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations. From 2b3389677a833cee0880226533a1768b1a9508d2 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 5 Jun 2024 11:29:20 +0300 Subject: [PATCH 11/39] ggml : refactor rope norm/neox (#7634) * ggml : unify rope norm/neox (CPU) * ggml : fix compile warning * ggml : remove GLM rope mode ggml-ci * metal : better rope implementation ggml-ci * cuda : better rope implementation ggml-ci * naming : n_orig_ctx -> n_ctx_orig ggml-ci * dev : add reminders to update backends ggml-ci * vulkan : fix ggml_rope_ext() usage * cuda : fix array size + indents ggml-ci --- examples/baby-llama/baby-llama.cpp | 12 +- examples/convert-legacy-llama.py | 12 +- examples/finetune/finetune.cpp | 2 +- .../train-text-from-scratch.cpp | 2 +- ggml-cuda/rope.cu | 275 ++++++-------- ggml-kompute.cpp | 13 +- ggml-metal.m | 54 +-- ggml-metal.metal | 200 ++++++----- ggml-sycl.cpp | 74 +--- ggml-vulkan.cpp | 22 +- ggml.c | 334 ++++++------------ ggml.h | 36 +- kompute-shaders/op_rope_f16.comp | 2 +- kompute-shaders/op_rope_f32.comp | 2 +- kompute-shaders/rope_common.comp | 12 +- llama.cpp | 124 +++---- tests/test-backend-ops.cpp | 31 +- tests/test-grad0.cpp | 4 +- tests/test-rope.cpp | 6 +- 19 files changed, 485 insertions(+), 732 deletions(-) diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index bf0125e75..4f6c3746a 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -522,8 +522,8 @@ static struct ggml_tensor * forward( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, 1] // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0); // store key and value to memory { @@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch( // wk shape [n_embd, n_embd, 1, 1] // Qcur shape [n_embd/n_head, n_head, N, n_batch] // Kcur shape [n_embd/n_head, n_head, N, n_batch] - struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); - struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0); + struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0); assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch); assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch); @@ -1056,7 +1056,7 @@ 
static struct ggml_tensor * forward_lora( model->layers[il].wqb, cur)), n_embd/n_head, n_head, N), - KQ_pos, n_rot, 0, 0); + KQ_pos, n_rot, 0); struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, @@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora( model->layers[il].wkb, cur)), n_embd/n_head, n_head, N), - KQ_pos, n_rot, 0, 0); + KQ_pos, n_rot, 0); // store key and value to memory { diff --git a/examples/convert-legacy-llama.py b/examples/convert-legacy-llama.py index fd8401015..721a57c00 100755 --- a/examples/convert-legacy-llama.py +++ b/examples/convert-legacy-llama.py @@ -176,7 +176,7 @@ class Params: rope_scaling_type: gguf.RopeScalingType | None = None f_rope_freq_base: float | None = None f_rope_scale: float | None = None - n_orig_ctx: int | None = None + n_ctx_orig: int | None = None rope_finetuned: bool | None = None ftype: GGMLFileType | None = None @@ -226,7 +226,7 @@ class Params: with open(config_path) as f: config = json.load(f) - rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None + rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None rope_scaling = config.get("rope_scaling") if rope_scaling is not None and (typ := rope_scaling.get("type")): @@ -236,7 +236,7 @@ class Params: rope_scaling_type = gguf.RopeScalingType.LINEAR elif typ == "yarn": rope_scaling_type = gguf.RopeScalingType.YARN - n_orig_ctx = rope_scaling['original_max_position_embeddings'] + n_ctx_orig = rope_scaling['original_max_position_embeddings'] rope_finetuned = rope_scaling['finetuned'] else: raise NotImplementedError(f'Unknown rope scaling type: {typ}') @@ -272,7 +272,7 @@ class Params: f_rope_freq_base = config.get("rope_theta"), rope_scaling_type = rope_scaling_type, f_rope_scale = f_rope_scale, - n_orig_ctx = n_orig_ctx, + n_ctx_orig = n_ctx_orig, rope_finetuned = rope_finetuned, ) @@ -864,8 +864,8 @@ class OutputFile: self.gguf.add_rope_scaling_type(params.rope_scaling_type) self.gguf.add_rope_scaling_factor(params.f_rope_scale) - if params.n_orig_ctx is not None: - self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx) + if params.n_ctx_orig is not None: + self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig) if params.rope_finetuned is not None: self.gguf.add_rope_scaling_finetuned(params.rope_finetuned) diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 22425730f..71a4333ee 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -564,7 +564,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( const int rope_mode = 0; return ggml_rope_ext(ctx, - t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, + t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e2f85c682..b779f6bd4 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -302,7 +302,7 @@ static struct ggml_tensor * llama_build_train_graphs( const int rope_mode = 0; return ggml_rope_ext( - ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f + ctx, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f ); }; diff --git a/ggml-cuda/rope.cu b/ggml-cuda/rope.cu index 0dd07977e..596fb7c13 100644 --- 
a/ggml-cuda/rope.cu +++ b/ggml-cuda/rope.cu @@ -1,7 +1,7 @@ #include "rope.cuh" struct rope_corr_dims { - float v[4]; + float v[2]; }; static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { @@ -13,8 +13,7 @@ static __device__ float rope_yarn_ramp(const float low, const float high, const // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. static __device__ void rope_yarn( float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta -) { + float * cos_theta, float * sin_theta) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta = theta_interp; @@ -29,27 +28,38 @@ static __device__ void rope_yarn( *sin_theta = sinf(theta) * mscale; } -// rope == RoPE == rotary positional embedding -template -static __global__ void rope( - const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - float ext_factor, float attn_factor, rope_corr_dims corr_dims -) { - const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); +template +static __global__ void rope_norm( + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { + const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (col >= ncols) { + if (i0 >= ne0) { return; } const int row = blockDim.x*blockIdx.x + threadIdx.x; - const int i = row*ncols + col; + + if (i0 >= n_dims) { + const int i = row*ne0 + i0; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ne0 + i0; const int i2 = row/p_delta_rows; - const int p = has_pos ? pos[i2] : 0; - const float theta_base = p*powf(freq_base, -float(col)/ncols); + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); + const float freq_factor = has_ff ? 
freq_factors[i0/2] : 1.0f; + + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + 1]; @@ -58,23 +68,20 @@ static __global__ void rope( dst[i + 1] = x0*sin_theta + x1*cos_theta; } -template +template static __global__ void rope_neox( - const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors -) { - const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors) { + const int i0 = 2*(blockDim.y*blockIdx.y + threadIdx.y); - if (col >= ncols) { + if (i0 >= ne0) { return; } const int row = blockDim.x*blockIdx.x + threadIdx.x; - const int ib = col / n_dims; - const int ic = col % n_dims; - if (ib > 0) { - const int i = row*ncols + ib*n_dims + ic; + if (i0 >= n_dims) { + const int i = row*ne0 + i0; dst[i + 0] = x[i + 0]; dst[i + 1] = x[i + 1]; @@ -82,16 +89,17 @@ static __global__ void rope_neox( return; } - const int i = row*ncols + ib*n_dims + ic/2; + const int i = row*ne0 + i0/2; const int i2 = row/p_delta_rows; - const int p = has_pos ? pos[i2] : 0; - const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f; + const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f); - const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor; + const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta); + float cos_theta; + float sin_theta; + + rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); const float x0 = x[i + 0]; const float x1 = x[i + n_dims/2]; @@ -100,144 +108,81 @@ static __global__ void rope_neox( dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; } -static __global__ void rope_glm_f32( - const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - int n_ctx -) { - const int col = blockDim.x*blockIdx.x + threadIdx.x; - const int half_n_dims = ncols/4; - - if (col >= half_n_dims) { - return; - } - - const int row = blockDim.y*blockIdx.y + threadIdx.y; - const int i = row*ncols + col; - const int i2 = row/p_delta_rows; - - const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); - // FIXME: this is likely wrong - const int p = pos != nullptr ? 
pos[i2] : 0; - - const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + half_n_dims]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; - - const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; - const float sin_block_theta = sinf(block_theta); - const float cos_block_theta = cosf(block_theta); - - const float x2 = x[i + half_n_dims * 2]; - const float x3 = x[i + half_n_dims * 3]; - - dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; - dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; -} - - template -static void rope_cuda( - const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream -) { - GGML_ASSERT(ncols % 2 == 0); +static void rope_norm_cuda( + const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { + GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); - const dim3 block_nums(nrows, num_blocks_x, 1); - if (pos == nullptr) { - rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims - ); + const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nr, n_blocks_x, 1); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + + if (freq_factors == nullptr) { + rope_norm<<>>( + x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, freq_factors + ); } else { - rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims - ); + rope_norm<<>>( + x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, freq_factors + ); } } template static void rope_neox_cuda( - const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream -) { - GGML_ASSERT(ncols % 2 == 0); + const T * x, T * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { + GGML_ASSERT(ne0 % 2 == 0); const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); - const dim3 block_nums(nrows, num_blocks_x, 1); + const int n_blocks_x = (ne0 + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nr, n_blocks_x, 1); const float theta_scale = powf(freq_base, -2.0f/n_dims); - if (pos == nullptr) { - if (freq_factors == nullptr) { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + if (freq_factors == nullptr) { + rope_neox<<>>( + x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, 
corr_dims, theta_scale, freq_factors ); - } else { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors - ); - } } else { - if (freq_factors == nullptr) { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + rope_neox<<>>( + x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, freq_factors ); - } else { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, freq_factors - ); - } } } -static void rope_glm_f32_cuda( - const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, int n_ctx, cudaStream_t stream -) { - GGML_ASSERT(ncols % 4 == 0); - const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); - const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; - const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); +static void rope_norm_cuda_f16( + const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { + + rope_norm_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } -static void rope_cuda_f16( - const half * x, half * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { +static void rope_norm_cuda_f32( + const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { - rope_cuda(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); -} - -static void rope_cuda_f32( - const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { - - rope_cuda(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); + rope_norm_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } static void rope_neox_cuda_f16( - const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + const half * x, half * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream) { - rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); + rope_neox_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } static void rope_neox_cuda_f32( - const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, 
float freq_scale, int p_delta_rows, + const float * x, float * dst, int ne0, int n_dims, int nr, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, cudaStream_t stream ) { - rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); + rope_neox_cuda(x, dst, ne0, n_dims, nr, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream); } void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { @@ -258,16 +203,22 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; - const int64_t nrows = ggml_nrows(src0); + const int64_t nr = ggml_nrows(src0); - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; // RoPE alteration for extended context - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + float freq_base; + float freq_scale; + float ext_factor; + float attn_factor; + float beta_fast; + float beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); @@ -275,38 +226,28 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - const float * freq_factors = nullptr; - const int32_t * pos = nullptr; - const bool is_neox = mode & 2; - const bool is_glm = mode & 4; - pos = (const int32_t *) src1_d; + const int32_t * pos = (const int32_t *) src1_d; - if (is_neox) { - if (src2 != nullptr) { - freq_factors = (const float *) src2->data; - } - } else { - GGML_ASSERT(src2 == nullptr && "TODO: freq_factors not implemented for !is_neox"); + const float * freq_factors = nullptr; + if (src2 != nullptr) { + freq_factors = (const float *) src2->data; } rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); // compute - if (is_glm) { - GGML_ASSERT(false); - rope_glm_f32_cuda(src0_d, dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, stream); - } else if (is_neox) { + if (is_neox) { if (src0->type == GGML_TYPE_F32) { rope_neox_cuda_f32( - (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream ); } else if (src0->type == GGML_TYPE_F16) { rope_neox_cuda_f16( - (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + (const half 
*)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, stream ); } else { @@ -314,14 +255,14 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { } } else { if (src0->type == GGML_TYPE_F32) { - rope_cuda_f32( - (const float *)src0_d, (float *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, stream + rope_norm_cuda_f32( + (const float *)src0_d, (float *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, stream ); } else if (src0->type == GGML_TYPE_F16) { - rope_cuda_f16( - (const half *)src0_d, (half *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, stream + rope_norm_cuda_f16( + (const half *)src0_d, (half *)dst_d, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, freq_factors, stream ); } else { GGML_ASSERT(false); diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index eabd70d5e..5592741be 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1192,7 +1192,7 @@ static void ggml_vk_rope( const std::shared_ptr& inB, const std::shared_ptr& out, uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_orig_ctx, + ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig, float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow, int32_t ne01, int32_t ne02, int32_t ne03, uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, @@ -1221,14 +1221,14 @@ static void ggml_vk_rope( struct PushConstants { uint32_t inAOff, inBOff, outOff; - int32_t n_dims, mode, n_orig_ctx; + int32_t n_dims, mode, n_ctx_orig; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; uint32_t nb00, nb01, nb02, nb03; int32_t ne0; uint32_t nb0, nb1, nb2, nb3; } pushConsts { safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size), - n_dims, mode, n_orig_ctx, + n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, nb00, nb01, nb02, nb03, ne0, @@ -1692,13 +1692,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225") GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet"); +#pragma message("TODO: update rope NORM mode to match NEOX mode") +#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") + GGML_ASSERT(ne10 == ne02); GGML_ASSERT(src0t == dstt); // const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); @@ -1708,7 +1711,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); ggml_vk_rope( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_orig_ctx, + seq, id_src0, id_src1, id_dst, off_src0, 
off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 ); diff --git a/ggml-metal.m b/ggml-metal.m index fddc44f78..946f11813 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -172,8 +172,10 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, - GGML_METAL_KERNEL_TYPE_ROPE_F32, - GGML_METAL_KERNEL_TYPE_ROPE_F16, + GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, + GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, + GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, + GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, GGML_METAL_KERNEL_TYPE_IM2COL_F16, GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32, @@ -626,8 +628,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32, rope_norm_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16, rope_norm_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32, rope_neox_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16, rope_neox_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); @@ -2285,7 +2289,7 @@ static enum ggml_status ggml_metal_graph_compute( const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; float freq_base; float freq_scale; @@ -2302,21 +2306,22 @@ static enum ggml_status ggml_metal_graph_compute( memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); const bool is_neox = mode & 2; - const bool is_glm = mode & 4; - - GGML_ASSERT(!is_glm && "GLM RoPE not implemented in Metal"); - - if (!is_neox) { - GGML_ASSERT(id_src2 == nil && "TODO: freq_factors not implemented for !is_neox"); - } id pipeline = nil; - switch (src0->type) { - case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break; - case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break; - default: GGML_ASSERT(false); - }; + if (!is_neox) { + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; + default: GGML_ASSERT(false); + }; + } else { + switch (src0->type) { + case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; + case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; + default: GGML_ASSERT(false); + 
}; + } [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -2345,14 +2350,13 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:19]; [encoder setBytes:&n_past length:sizeof( int) atIndex:20]; [encoder setBytes:&n_dims length:sizeof( int) atIndex:21]; - [encoder setBytes:&mode length:sizeof( int) atIndex:22]; - [encoder setBytes:&n_orig_ctx length:sizeof( int) atIndex:23]; - [encoder setBytes:&freq_base length:sizeof( float) atIndex:24]; - [encoder setBytes:&freq_scale length:sizeof( float) atIndex:25]; - [encoder setBytes:&ext_factor length:sizeof( float) atIndex:26]; - [encoder setBytes:&attn_factor length:sizeof( float) atIndex:27]; - [encoder setBytes:&beta_fast length:sizeof( float) atIndex:28]; - [encoder setBytes:&beta_slow length:sizeof( float) atIndex:29]; + [encoder setBytes:&n_ctx_orig length:sizeof( int) atIndex:22]; + [encoder setBytes:&freq_base length:sizeof( float) atIndex:23]; + [encoder setBytes:&freq_scale length:sizeof( float) atIndex:24]; + [encoder setBytes:&ext_factor length:sizeof( float) atIndex:25]; + [encoder setBytes:&attn_factor length:sizeof( float) atIndex:26]; + [encoder setBytes:&beta_fast length:sizeof( float) atIndex:27]; + [encoder setBytes:&beta_slow length:sizeof( float) atIndex:28]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; diff --git a/ggml-metal.metal b/ggml-metal.metal index 0cb85e1a5..e2796fd60 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -1654,8 +1654,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. static void rope_yarn( float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, - thread float * cos_theta, thread float * sin_theta -) { + thread float * cos_theta, thread float * sin_theta) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta = theta_interp; @@ -1672,55 +1671,20 @@ static void rope_yarn( // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { - return n_dims * log(n_orig_ctx / (n_rot * 2 * M_PI_F)) / (2 * log(base)); +static float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * log(n_ctx_orig / (n_rot * 2 * M_PI_F)) / (2 * log(base)); } static void rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base))); } -typedef void (rope_t)( - device const void * src0, - device const int32_t * src1, - device const float * src2, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant 
int64_t & ne02, - constant int64_t & ne03, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant uint64_t & nb03, - constant int64_t & ne0, - constant int64_t & ne1, - constant int64_t & ne2, - constant int64_t & ne3, - constant uint64_t & nb0, - constant uint64_t & nb1, - constant uint64_t & nb2, - constant uint64_t & nb3, - constant int & n_past, - constant int & n_dims, - constant int & mode, - constant int & n_orig_ctx, - constant float & freq_base, - constant float & freq_scale, - constant float & ext_factor, - constant float & attn_factor, - constant float & beta_fast, - constant float & beta_slow, - uint tiitg[[thread_index_in_threadgroup]], - uint3 tptg[[threads_per_threadgroup]], - uint3 tgpig[[threadgroup_position_in_grid]]); - template -kernel void kernel_rope( +kernel void kernel_rope_norm( device const void * src0, device const int32_t * src1, device const float * src2, @@ -1743,8 +1707,7 @@ kernel void kernel_rope( constant uint64_t & nb3, constant int & n_past, constant int & n_dims, - constant int & mode, - constant int & n_orig_ctx, + constant int & n_ctx_orig, constant float & freq_base, constant float & freq_scale, constant float & ext_factor, @@ -1758,69 +1721,130 @@ kernel void kernel_rope( const int64_t i2 = tgpig[1]; const int64_t i1 = tgpig[0]; - const bool is_neox = mode & 2; - float corr_dims[2]; - rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); device const int32_t * pos = src1; - const int64_t p = pos[i2]; - - const float theta_base = (float)p; + const float theta_base = (float) pos[i2]; const float inv_ndims = -1.f/n_dims; - if (!is_neox) { - for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + float cos_theta; + float sin_theta; + + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + if (i0 < n_dims) { + const int64_t ic = i0/2; + const float theta = theta_base * pow(freq_base, inv_ndims*i0); - float cos_theta, sin_theta; - rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + const float freq_factor = src2 != src0 ? src2[ic] : 1.0f; + + rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const T x0 = src[0]; - const T x1 = src[1]; + const float x0 = src[0]; + const float x1 = src[1]; dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[1] = x0*sin_theta + x1*cos_theta; - } - } else { - for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) { - if (ic < n_dims) { - const int64_t i0 = ic/2; + } else { + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - const float freq_factor = src2 != src0 ? 
src2[i0] : 1.0f; - - const float theta = theta_base * pow(freq_base, inv_ndims*ic); - - float cos_theta, sin_theta; - rope_yarn(theta/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta); - - device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - const int64_t i0 = ic; - - device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } + dst_data[0] = src[0]; + dst_data[1] = src[1]; } } } -template [[host_name("kernel_rope_f32")]] kernel rope_t kernel_rope; -template [[host_name("kernel_rope_f16")]] kernel rope_t kernel_rope; +template +kernel void kernel_rope_neox( + device const void * src0, + device const int32_t * src1, + device const float * src2, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant int & n_past, + constant int & n_dims, + constant int & n_ctx_orig, + constant float & freq_base, + constant float & freq_scale, + constant float & ext_factor, + constant float & attn_factor, + constant float & beta_fast, + constant float & beta_slow, + uint tiitg[[thread_index_in_threadgroup]], + uint3 tptg[[threads_per_threadgroup]], + uint3 tgpig[[threadgroup_position_in_grid]]) { + const int64_t i3 = tgpig[2]; + const int64_t i2 = tgpig[1]; + const int64_t i1 = tgpig[0]; + + float corr_dims[2]; + rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); + + device const int32_t * pos = src1; + + const float theta_base = (float) pos[i2]; + const float inv_ndims = -1.f/n_dims; + + float cos_theta; + float sin_theta; + + for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) { + if (i0 < n_dims) { + const int64_t ic = i0/2; + + const float theta = theta_base * pow(freq_base, inv_ndims*i0); + + const float freq_factor = src2 != src0 ? 
src2[ic] : 1.0f; + + rope_yarn(theta/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); + + const float x0 = src[0]; + const float x1 = src[n_dims/2]; + + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; + } else { + device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } + } +} + +typedef decltype(kernel_rope_norm) kernel_rope_norm_t; +typedef decltype(kernel_rope_neox) kernel_rope_neox_t; + +template [[host_name("kernel_rope_norm_f32")]] kernel kernel_rope_norm_t kernel_rope_norm; +template [[host_name("kernel_rope_norm_f16")]] kernel kernel_rope_norm_t kernel_rope_norm; + +template [[host_name("kernel_rope_neox_f32")]] kernel kernel_rope_neox_t kernel_rope_neox; +template [[host_name("kernel_rope_neox_f16")]] kernel kernel_rope_neox_t kernel_rope_neox; typedef void (im2col_t)( device const float * x, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 5cd97e4ff..3ff76474d 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -8928,49 +8928,6 @@ static void rope_neox( dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; } -static void rope_glm_f32( - const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - int n_ctx -, const sycl::nd_item<3> &item_ct1) { - const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); - const int half_n_dims = ncols/4; - - if (col >= half_n_dims) { - return; - } - - const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1); - const int i = row*ncols + col; - const int i2 = row/p_delta_rows; - - const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols); - // FIXME: this is likely wrong - const int p = pos != nullptr ? 
pos[i2] : 0; - - const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale; - const float sin_theta = sycl::sin((float)theta); - const float cos_theta = sycl::cos((float)theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + half_n_dims]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; - - const float block_theta = - ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale; - const float sin_block_theta = sycl::sin((float)block_theta); - const float cos_block_theta = sycl::cos((float)block_theta); - - const float x2 = x[i + half_n_dims * 2]; - const float x3 = x[i + half_n_dims * 3]; - - dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; - dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; -} - static void k_sum_rows_f32(const float * x, float * dst, const int ncols, const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(1); @@ -12520,22 +12477,6 @@ static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows, } } -static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows, - const int32_t *pos, float freq_scale, - int p_delta_rows, float freq_base, int n_ctx, - dpct::queue_ptr stream) { - GGML_ASSERT(ncols % 4 == 0); - const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4); - const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE; - const sycl::range<3> block_nums(1, nrows, num_blocks_x); - stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_glm_f32(x, dst, ncols, pos, freq_scale, - p_delta_rows, freq_base, n_ctx, - item_ct1); - }); -} - static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, const int nrows, dpct::queue_ptr stream) { const sycl::range<3> block_dims(1, 1, WARP_SIZE); @@ -14066,8 +14007,8 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; // RoPE alteration for extended context float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; @@ -14087,7 +14028,9 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, } const bool is_neox = mode & 2; - const bool is_glm = mode & 4; + +#pragma message("TODO: update rope NORM mode to match NEOX mode") +#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") if (is_neox) { pos = (const int32_t *) src1_dd; @@ -14100,13 +14043,10 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1, } rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims.v); // compute - if (is_glm) { - GGML_ASSERT(false); - rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); - } else if (is_neox) { + if (is_neox) { if (src0->type == GGML_TYPE_F32) { rope_neox_sycl( (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, diff --git 
a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 5e12ea9dd..e0c512c0d 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -3898,11 +3898,6 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const { const int mode = ((const int32_t *) dst->op_params)[2]; const bool is_neox = mode & 2; - const bool is_glm = mode & 4; - - if (is_glm) { - return nullptr; - } if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { @@ -4401,7 +4396,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; // const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; const float freq_base = ((float *) dst->op_params)[5]; const float freq_scale = ((float *) dst->op_params)[6]; const float ext_factor = ((float *) dst->op_params)[7]; @@ -4410,12 +4405,12 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con const float beta_slow = ((float *) dst->op_params)[10]; const bool is_neox = mode & 2; - const bool is_glm = mode & 4; - GGML_ASSERT(!is_glm); +#pragma message("TODO: update rope NORM mode to match NEOX mode") +#pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634") float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); if (is_neox) { const float theta_scale = powf(freq_base, -2.0f/n_dims); @@ -6485,9 +6480,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const case GGML_OP_ROPE: { const int mode = ((const int32_t *) op->op_params)[2]; - const bool is_glm = mode & 4; - return !is_glm; + return true; } break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -6992,15 +6986,15 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ } else if (tensor->op == GGML_OP_ROPE) { const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ggml_ctx = ((int32_t *) tensor->op_params)[3]; - const int n_orig_ggml_ctx = ((int32_t *) tensor->op_params)[4]; + //const int n_ctx_ggml = ((int32_t *) tensor->op_params)[3]; + const int n_ctx_orig_ggml = ((int32_t *) tensor->op_params)[4]; float freq_base = ((float *) tensor->op_params)[5]; float freq_scale = ((float *) tensor->op_params)[6]; float ext_factor = ((float *) tensor->op_params)[7]; float attn_factor = ((float *) tensor->op_params)[8]; float beta_fast = ((float *) tensor->op_params)[9]; float beta_slow = ((float *) tensor->op_params)[10]; - tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ggml_ctx, n_orig_ggml_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + tensor_clone = ggml_rope_ext(ggml_ctx, src0_clone, src1_clone, src2_clone, n_dims, mode, n_ctx_orig_ggml, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); } else if (tensor->op == GGML_OP_UNARY) { switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_SILU: diff --git a/ggml.c b/ggml.c index 11e5c34ab..1fc77743b 100644 --- a/ggml.c +++ b/ggml.c @@ -6250,16 +6250,13 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float 
ext_factor, float attn_factor, float beta_fast, float beta_slow, - float xpos_base, - bool xpos_down, bool inplace) { GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported"); @@ -6280,15 +6277,13 @@ static struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; memcpy(params + 5, &freq_base, sizeof(float)); memcpy(params + 6, &freq_scale, sizeof(float)); memcpy(params + 7, &ext_factor, sizeof(float)); memcpy(params + 8, &attn_factor, sizeof(float)); memcpy(params + 9, &beta_fast, sizeof(float)); memcpy(params + 10, &beta_slow, sizeof(float)); - memcpy(params + 11, &xpos_base, sizeof(float)); - memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE; @@ -6305,10 +6300,9 @@ struct ggml_tensor * ggml_rope( struct ggml_tensor * a, struct ggml_tensor * b, int n_dims, - int mode, - int n_ctx) { + int mode) { return ggml_rope_impl( - ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, false + ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, false ); } @@ -6317,10 +6311,9 @@ struct ggml_tensor * ggml_rope_inplace( struct ggml_tensor * a, struct ggml_tensor * b, int n_dims, - int mode, - int n_ctx) { + int mode) { return ggml_rope_impl( - ctx, a, b, NULL, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, false, true + ctx, a, b, NULL, n_dims, mode, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, true ); } @@ -6331,8 +6324,7 @@ struct ggml_tensor * ggml_rope_ext( struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -6340,8 +6332,8 @@ struct ggml_tensor * ggml_rope_ext( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, false ); } @@ -6352,8 +6344,7 @@ struct ggml_tensor * ggml_rope_ext_inplace( struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -6361,8 +6352,8 @@ struct ggml_tensor * ggml_rope_ext_inplace( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, b, c, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, true ); } @@ -6372,8 +6363,7 @@ struct ggml_tensor * ggml_rope_custom( struct ggml_tensor * b, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -6381,8 +6371,8 @@ struct ggml_tensor * ggml_rope_custom( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, false + ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, false ); } @@ -6392,8 +6382,7 @@ struct ggml_tensor * 
ggml_rope_custom_inplace( struct ggml_tensor * b, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -6401,21 +6390,11 @@ struct ggml_tensor * ggml_rope_custom_inplace( float beta_fast, float beta_slow) { return ggml_rope_impl( - ctx, a, b, NULL, n_dims, mode, n_ctx, n_orig_ctx, freq_base, freq_scale, - ext_factor, attn_factor, beta_fast, beta_slow, 0.0f, false, true + ctx, a, b, NULL, n_dims, mode, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow, true ); } -struct ggml_tensor * ggml_rope_xpos_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - float base, - bool down) { - return ggml_rope_impl(ctx, a, b, NULL, n_dims, 0, 0, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f, base, down, true); -} - // ggml_rope_back struct ggml_tensor * ggml_rope_back( @@ -6425,16 +6404,13 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, - float beta_slow, - float xpos_base, - bool xpos_down) { + float beta_slow) { GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(b->type == GGML_TYPE_I32); GGML_ASSERT(a->ne[2] == b->ne[0]); @@ -6450,15 +6426,13 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx }; + int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig }; memcpy(params + 5, &freq_base, sizeof(float)); memcpy(params + 6, &freq_scale, sizeof(float)); memcpy(params + 7, &ext_factor, sizeof(float)); memcpy(params + 8, &attn_factor, sizeof(float)); memcpy(params + 9, &beta_fast, sizeof(float)); memcpy(params + 10, &beta_slow, sizeof(float)); - memcpy(params + 11, &xpos_base, sizeof(float)); - memcpy(params + 12, &xpos_down, sizeof(bool)); ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ROPE_BACK; @@ -14227,8 +14201,7 @@ static float rope_yarn_ramp(const float low, const float high, const int i0) { // MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. 
static void rope_yarn( float theta_extrap, float freq_scale, float corr_dims[2], int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta -) { + float * cos_theta, float * sin_theta) { // Get n-d rotational scaling corrected for extrapolation float theta_interp = freq_scale * theta_extrap; float theta = theta_interp; @@ -14245,18 +14218,19 @@ static void rope_yarn( // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -static float ggml_rope_yarn_corr_dim(int n_dims, int n_orig_ctx, float n_rot, float base) { - return n_dims * logf(n_orig_ctx / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); +static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base)); } static void ggml_rope_cache_init( - float theta_base, float freq_scale, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, - float * cache, float sin_sign, float theta_scale -) { + float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale, + float * cache, float sin_sign, float theta_scale) { + // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py float theta = theta_base; for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + const float ff = freq_factors ? freq_factors[i0/2] : 1.0f; rope_yarn( - theta, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] + theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1] ); cache[i0 + 1] *= sin_sign; @@ -14265,11 +14239,11 @@ static void ggml_rope_cache_init( } GGML_CALL void ggml_rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2] + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2] ) { // start and end correction dims - float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_fast, freq_base)); - float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_orig_ctx, beta_slow, freq_base)); + float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base)); + float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base)); dims[0] = MAX(0, start); dims[1] = MIN(n_dims - 1, end); } @@ -14289,15 +14263,11 @@ static void ggml_compute_forward_rope_f32( float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - // these two only relevant for xPos RoPE: - float xpos_base; - bool xpos_down; - //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); @@ -14305,8 +14275,6 @@ static void ggml_compute_forward_rope_f32( memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - memcpy(&xpos_base, (int32_t *) 
dst->op_params + 11, sizeof(float)); - memcpy(&xpos_down, (int32_t *) dst->op_params + 12, sizeof(bool)); GGML_TENSOR_UNARY_OP_LOCALS @@ -14336,20 +14304,15 @@ static void ggml_compute_forward_rope_f32( const float theta_scale = powf(freq_base, -2.0f/n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; - const bool is_glm = mode & 4; const float * freq_factors = NULL; - if (is_neox) { - if (src2 != NULL) { - GGML_ASSERT(src2->type == GGML_TYPE_F32); - GGML_ASSERT(src2->ne[0] >= n_dims / 2); - freq_factors = (const float *) src2->data; - } - } else { - GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox"); + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; } // backward process uses inverse rotation by cos and sin. @@ -14364,95 +14327,51 @@ static void ggml_compute_forward_rope_f32( const int64_t p = pos[i2]; float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; - if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox - ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); - } + ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; - float theta_base = (float)p; - - if (is_glm) { - theta_base = MIN(p, n_ctx - 2); - float block_theta = MAX(p - (n_ctx - 2), 0); - for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base) * sin_sign; - const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta) * sin_sign; - - theta_base *= theta_scale; - block_theta *= theta_scale; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - const float x2 = src[n_dims]; - const float x3 = src[n_dims/2*3]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - dst_data[n_dims] = x2*cos_block_theta - x3*sin_block_theta; - dst_data[n_dims/2*3] = x2*sin_block_theta + x3*cos_block_theta; - } - } else if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + if (!is_neox) { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; - // zeta scaling for xPos only: - float zeta = xpos_base != 0.0f ? 
powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; - if (xpos_down) zeta = 1.0f / zeta; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); const float x0 = src[0]; const float x1 = src[1]; - dst_data[0] = x0*cos_theta*zeta - x1*sin_theta*zeta; - dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[1] = x0*sin_theta + x1*cos_theta; } } else { - // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py - for (int64_t ic = 0; ic < ne0; ic += 2) { - if (ic < n_dims) { - const int64_t i0 = ic/2; + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0/2; - const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f; + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; - float cos_theta, sin_theta; - rope_yarn( - theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, - &cos_theta, &sin_theta - ); + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - sin_theta *= sin_sign; - theta_base *= theta_scale; + const float x0 = src[0]; + const float x1 = src[n_dims/2]; - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = src[0]; - const float x1 = src[n_dims/2]; - - dst_data[0] = x0*cos_theta - x1*sin_theta; - dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - const int64_t i0 = ic; - - const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } + dst_data[0] = x0*cos_theta - x1*sin_theta; + dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta; } } + + for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { + const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } } } } @@ -14477,8 +14396,8 @@ static void ggml_compute_forward_rope_f16( //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_dims = ((int32_t *) dst->op_params)[1]; const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + //const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); @@ -14514,20 +14433,15 @@ static void ggml_compute_forward_rope_f16( const float theta_scale = powf(freq_base, -2.0f/n_dims); float corr_dims[2]; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims); + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims); const bool is_neox = mode & 2; - const bool is_glm = mode & 4; const float * freq_factors = NULL; - if (is_neox) { - if (src2 
!= NULL) { - GGML_ASSERT(src2->type == GGML_TYPE_F32); - GGML_ASSERT(src2->ne[0] >= n_dims / 2); - freq_factors = (const float *) src2->data; - } - } else { - GGML_ASSERT(src2 == NULL && "TODO: freq_factors not implemented for !is_neox"); + if (src2 != NULL) { + GGML_ASSERT(src2->type == GGML_TYPE_F32); + GGML_ASSERT(src2->ne[0] >= n_dims / 2); + freq_factors = (const float *) src2->data; } // backward process uses inverse rotation by cos and sin. @@ -14542,43 +14456,14 @@ static void ggml_compute_forward_rope_f16( const int64_t p = pos[i2]; float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith; - if (!is_glm && !is_neox) { // TODO: cache sin/cos for glm, neox - ggml_rope_cache_init(p, freq_scale, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); - } + ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale); for (int64_t i1 = 0; i1 < ne1; i1++) { if (ir++ < ir0) continue; if (ir > ir1) break; - float theta_base = (float)p; - - if (is_glm) { - theta_base = MIN(p, n_ctx - 2); - float block_theta = MAX(p - (n_ctx - 2), 0); - for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { - const float cos_theta = cosf(theta_base); - const float sin_theta = sinf(theta_base) * sin_sign; - const float cos_block_theta = cosf(block_theta); - const float sin_block_theta = sinf(block_theta) * sin_sign; - - theta_base *= theta_scale; - block_theta *= theta_scale; - - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - const float x2 = GGML_FP16_TO_FP32(src[n_dims]); - const float x3 = GGML_FP16_TO_FP32(src[n_dims/2*3]); - - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta); - dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta); - } - } else if (!is_neox) { - for (int64_t i0 = 0; i0 < ne0; i0 += 2) { + if (!is_neox) { + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { const float cos_theta = cache[i0 + 0]; const float sin_theta = cache[i0 + 1]; @@ -14592,41 +14477,30 @@ static void ggml_compute_forward_rope_f16( dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } else { - // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py - for (int64_t ic = 0; ic < ne0; ic += 2) { - if (ic < n_dims) { - const int64_t i0 = ic/2; + for (int64_t i0 = 0; i0 < n_dims; i0 += 2) { + const int64_t ic = i0/2; - const float freq_factor = freq_factors ? 
freq_factors[i0] : 1.0f; + const float cos_theta = cache[i0 + 0]; + const float sin_theta = cache[i0 + 1]; - float cos_theta, sin_theta; - rope_yarn( - theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, - &cos_theta, &sin_theta - ); + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0); - sin_theta *= sin_sign; - theta_base *= theta_scale; + const float x0 = GGML_FP16_TO_FP32(src[0]); + const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - const float x0 = GGML_FP16_TO_FP32(src[0]); - const float x1 = GGML_FP16_TO_FP32(src[n_dims/2]); - - dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); - dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); - } else { - const int64_t i0 = ic; - - const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); - ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - - dst_data[0] = src[0]; - dst_data[1] = src[1]; - } + dst_data[0] = GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta); + dst_data[n_dims/2] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); } } + + for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) { + const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); + ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + dst_data[0] = src[0]; + dst_data[1] = src[1]; + } } } } @@ -18327,9 +18201,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + //const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_ctx_orig = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); @@ -18337,8 +18211,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = ggml_add_or_set(ctx, src0->grad, @@ -18348,16 +18220,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src2, n_dims, mode, - n_ctx, - n_orig_ctx, + n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, - beta_slow, - xpos_base, - xpos_down), + beta_slow), zero_table); } } break; @@ -18367,9 +18236,9 @@ static void ggml_compute_backward(struct ggml_context * 
ctx, struct ggml_tensor //const int n_past = ((int32_t *) tensor->op_params)[0]; const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; - const int n_ctx = ((int32_t *) tensor->op_params)[3]; - const int n_orig_ctx = ((int32_t *) tensor->op_params)[4]; - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down; + //const int n_ctx = ((int32_t *) tensor->op_params)[3]; + const int n_ctx_orig = ((int32_t *) tensor->op_params)[4]; + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float)); memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float)); @@ -18377,8 +18246,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float)); memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float)); memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float)); - memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float)); - memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool)); src0->grad = ggml_add_or_set(ctx, src0->grad, @@ -18388,16 +18255,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src2, n_dims, mode, - n_ctx, - n_orig_ctx, + n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, - xpos_base, - xpos_down, false), zero_table); } diff --git a/ggml.h b/ggml.h index addcf1bfe..13502a362 100644 --- a/ggml.h +++ b/ggml.h @@ -1465,7 +1465,6 @@ extern "C" { // rotary position embedding // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED) // if mode & 2 == 1, GPT-NeoX style - // if mode & 4 == 1, ChatGLM style // // b is an int32 vector with size a->ne[2], it contains the positions // c is freq factors (e.g. 
phi3-128k), (optional) @@ -1474,8 +1473,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, int n_dims, - int mode, - int n_ctx); + int mode); // in-place, returns view(a) GGML_API struct ggml_tensor * ggml_rope_inplace( @@ -1483,8 +1481,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, int n_dims, - int mode, - int n_ctx); + int mode); // custom RoPE GGML_API struct ggml_tensor * ggml_rope_ext( @@ -1494,8 +1491,7 @@ extern "C" { struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -1511,8 +1507,7 @@ extern "C" { struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -1526,8 +1521,7 @@ extern "C" { struct ggml_tensor * b, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -1542,8 +1536,7 @@ extern "C" { struct ggml_tensor * b, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, @@ -1552,17 +1545,9 @@ extern "C" { float beta_slow), "use ggml_rope_ext_inplace instead"); - struct ggml_tensor * ggml_rope_xpos_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int n_dims, - float base, - bool down); - // compute correction dims for YaRN RoPE scaling GGML_CALL void ggml_rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]); + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); // rotary position embedding backward, i.e compute dx from dy // a - dy @@ -1573,16 +1558,13 @@ extern "C" { struct ggml_tensor * c, int n_dims, int mode, - int n_ctx, - int n_orig_ctx, + int n_ctx_orig, float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, - float beta_slow, - float xpos_base, - bool xpos_down); + float beta_slow); // clamp // in-place, returns view(a) diff --git a/kompute-shaders/op_rope_f16.comp b/kompute-shaders/op_rope_f16.comp index b44622584..1a4058b3f 100644 --- a/kompute-shaders/op_rope_f16.comp +++ b/kompute-shaders/op_rope_f16.comp @@ -14,7 +14,7 @@ void main() { const bool is_neox = (pcs.mode & 2) != 0; float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); diff --git a/kompute-shaders/op_rope_f32.comp b/kompute-shaders/op_rope_f32.comp index 2c0235d75..65e03827a 100644 --- a/kompute-shaders/op_rope_f32.comp +++ b/kompute-shaders/op_rope_f32.comp @@ -14,7 +14,7 @@ void main() { const bool is_neox = (pcs.mode & 2) != 0; float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_orig_ctx, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); + rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); diff --git a/kompute-shaders/rope_common.comp b/kompute-shaders/rope_common.comp index 57ba6597a..7b9394cb2 100644 --- a/kompute-shaders/rope_common.comp +++ b/kompute-shaders/rope_common.comp @@ -9,7 +9,7 @@ layout (push_constant) uniform parameter { uint outOff; int n_dims; int mode; - int n_orig_ctx; + 
int n_ctx_orig; float freq_base; float freq_scale; float ext_factor; @@ -54,14 +54,14 @@ void rope_yarn( // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get // `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -float rope_yarn_corr_factor(int n_dims, int n_orig_ctx, float n_rot, float base) { - return n_dims * log(n_orig_ctx / (n_rot * TWOPI_F)) / (2 * log(base)); +float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) { + return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base)); } void rope_yarn_corr_dims( - int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, out float dims[2] + int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2] ) { // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_orig_ctx, beta_slow, freq_base))); + dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))); + dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base))); } diff --git a/llama.cpp b/llama.cpp index 06889126e..414d390e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1848,7 +1848,7 @@ struct llama_hparams { float rope_attn_factor = 1.0f; float rope_freq_base_train; float rope_freq_scale_train; - uint32_t n_yarn_orig_ctx; + uint32_t n_ctx_orig_yarn; float rope_yarn_log_mul; // for State Space Models @@ -1890,7 +1890,7 @@ struct llama_hparams { if (this->n_expert_shared != other.n_expert_shared) return true; if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true; if (this->ssm_d_conv != other.ssm_d_conv) return true; if (this->ssm_d_inner != other.ssm_d_inner) return true; @@ -1949,7 +1949,7 @@ struct llama_cparams { float rope_freq_base; float rope_freq_scale; - uint32_t n_yarn_orig_ctx; + uint32_t n_ctx_orig_yarn; // These hyperparameters are not exposed in GGUF, because all // existing YaRN models use the same values for them. float yarn_ext_factor; @@ -4005,8 +4005,8 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); hparams.rope_finetuned = rope_finetuned; - hparams.n_yarn_orig_ctx = hparams.n_ctx_train; - ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false); + hparams.n_ctx_orig_yarn = hparams.n_ctx_train; + ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn, false); // rope_freq_base (optional) hparams.rope_freq_base_train = 10000.0f; @@ -4968,7 +4968,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); - LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); + LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); @@ -7134,7 +7134,7 @@ struct llm_build_context { const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) const int32_t n_outputs; const int32_t kv_head; // index of where we store new KV data in the cache - const int32_t n_orig_ctx; + const int32_t n_ctx_orig; const bool flash_attn; @@ -7183,7 +7183,7 @@ struct llm_build_context { n_kv (worst_case ? kv_self.size : kv_self.n), n_outputs (worst_case ? n_tokens : lctx.n_outputs), kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), - n_orig_ctx (cparams.n_yarn_orig_ctx), + n_ctx_orig (cparams.n_ctx_orig_yarn), flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -7241,7 +7241,7 @@ struct llm_build_context { ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), 0), - lctx.inp_K_shift, rope_factors, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + lctx.inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(tmp, "K_shifted", il); @@ -7350,7 +7350,7 @@ struct llm_build_context { // choose long/short freq factors based on the context size const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max; - if (n_ctx_pre_seq > hparams.n_yarn_orig_ctx) { + if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) { return model.layers[il].rope_long; } @@ -7466,14 +7466,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7597,12 +7597,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -7709,14 +7709,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7829,13 +7829,13 @@ struct llm_build_context { // using mode = 2 for 
neox mode Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -7953,14 +7953,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -8106,14 +8106,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -8460,14 +8460,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -8900,14 +8900,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, Qcur, inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, Kcur, inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9019,13 +9019,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9131,14 +9131,14 @@ struct llm_build_context { Qcur = 
ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9245,14 +9245,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9397,7 +9397,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -9408,7 +9408,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9519,7 +9519,7 @@ struct llm_build_context { Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Qcur = ggml_rope_ext( - ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); @@ -9528,7 +9528,7 @@ struct llm_build_context { cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( - ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9636,13 +9636,13 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos, nullptr, - n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos, nullptr, - n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -9844,14 +9844,14 @@ struct llm_build_context { struct ggml_tensor * Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, 
beta_slow ); cb(Qcur, "Qcur", il); struct ggml_tensor * Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -9960,14 +9960,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -10077,14 +10077,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -10207,14 +10207,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -10327,7 +10327,7 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos, nullptr, - n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); @@ -10336,7 +10336,7 @@ struct llm_build_context { Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos, nullptr, - n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_embd_head_k, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -10447,14 +10447,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, 
attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -10737,14 +10737,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -10868,14 +10868,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -10982,14 +10982,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -11117,14 +11117,14 @@ struct llm_build_context { Qcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_ext( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -11334,7 +11334,7 @@ struct llm_build_context { q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE q_pe = ggml_rope_ext( ctx0, q_pe, inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_pe, "q_pe", il); @@ -11343,7 +11343,7 @@ struct llm_build_context { k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE k_pe = ggml_rope_ext( ctx0, k_pe, inp_pos, nullptr, - n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(k_pe, "k_pe", il); @@ -16067,8 +16067,8 @@ struct llama_context * llama_new_context_with_model( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? 
params.n_batch : params.n_ubatch); - cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : - hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : + cparams.n_ctx_orig_yarn = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : + hparams.n_ctx_orig_yarn != 0 ? hparams.n_ctx_orig_yarn : hparams.n_ctx_train; cparams.cb_eval = params.cb_eval; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8dc90a45d..ce406a8af 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1141,7 +1141,7 @@ struct test_rope : public test_case { const std::array ne_a; int n_dims; int mode; - int n_ctx; + int n_ctx; // used to generate positions float fs; // freq_scale float ef; // ext_factor float af; // attn_factor @@ -1168,7 +1168,7 @@ struct test_rope : public test_case { } ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]); ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr; - ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); + ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); return out; } @@ -1615,7 +1615,7 @@ struct llama_hparams { // cparams static constexpr uint32_t n_ctx = 512; // user-specified context size - static constexpr uint32_t n_orig_ctx = n_ctx; + static constexpr uint32_t n_ctx_orig = n_ctx; // batch int32_t n_tokens; @@ -1806,13 +1806,13 @@ struct test_llama : public test_llm { Qcur = ggml_rope_ext( ctx, ggml_reshape_3d(ctx, Qcur, hp.n_embd_head, hp.n_head, hp.n_tokens), inp_pos, nullptr, - hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale, + hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( ctx, ggml_reshape_3d(ctx, Kcur, hp.n_embd_head, hp.n_head_kv, hp.n_tokens), inp_pos, nullptr, - hp.n_rot, 0, 0, hp.n_orig_ctx, freq_base, freq_scale, + hp.n_rot, 0, hp.n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -1931,12 +1931,12 @@ struct test_falcon : public test_llm { // using mode = 2 for neox mode Qcur = ggml_rope_ext( - ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx, + ctx, Qcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_ext( - ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, 0, hp.n_orig_ctx, + ctx, Kcur, inp_pos, nullptr, hp.n_rot, 2, hp.n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); @@ -2236,15 +2236,15 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (float ef : { 0.0f, 0.7465f }) { for (float af : { 1.0f, 1.4245f }) { for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) { - // TODO: ff not supported yet for !neox - test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 7B - if (all) { - test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 13B - test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 30B - test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 65B - } - for (bool ff : {false, true}) { // freq_factors + test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 7B + + if (all) { + 
test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 13B + test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 30B + test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, ff, v)); // llama 65B + } + if (all) { test_cases.emplace_back(new test_rope(type, { 64, 1, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) test_cases.emplace_back(new test_rope(type, { 64, 71, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B) @@ -2256,6 +2256,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1}, 64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B) } } + all = false; } } diff --git a/tests/test-grad0.cpp b/tests/test-grad0.cpp index 21ca43be3..a35327645 100644 --- a/tests/test-grad0.cpp +++ b/tests/test-grad0.cpp @@ -1465,7 +1465,7 @@ int main(int argc, const char ** argv) { continue; } - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0)); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode)); GGML_PRINT_DEBUG("rope f32: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); check_gradient("rope f32", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY); @@ -1505,7 +1505,7 @@ int main(int argc, const char ** argv) { continue; } - struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode, 0)); + struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], p, n_rot, mode)); GGML_PRINT_DEBUG("rope f16: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode); check_gradient("rope f16", ctx0, x, f, ndims, nargs, 1e-1f, 1e-1f, INFINITY); diff --git a/tests/test-rope.cpp b/tests/test-rope.cpp index 26c1f42dc..f0895ffaa 100644 --- a/tests/test-rope.cpp +++ b/tests/test-rope.cpp @@ -162,12 +162,12 @@ int main(int /*argc*/, const char ** /*argv*/) { x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f); // 100, 101, 102, ..., 172 - struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode, 1024); + struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode); // -67, -67, -67, ..., -67 - struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode, 1024); // "context swap", i.e. forget n_past_0 - n_past_2 tokens + struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode); // "context swap", i.e. 
forget n_past_0 - n_past_2 tokens // 33, 34, 35, ..., 105 - struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode, 1024); + struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode); ggml_cgraph * gf = ggml_new_graph(ctx0); From 7d1a378b8fb266782d9248538a661405aad80768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 5 Jun 2024 16:53:00 +0200 Subject: [PATCH 12/39] CUDA: refactor mmq, dmmv, mmvq (#7716) * CUDA: refactor mmq, dmmv, mmvq * fix out-of-bounds write * struct for qk, qr, qi * fix cmake build * mmq_type_traits --- CMakeLists.txt | 4 + Makefile | 1 + ggml-common.h | 6 + ggml-cuda.cu | 84 +- ggml-cuda/common.cuh | 157 +- ggml-cuda/dmmv.cu | 30 +- ggml-cuda/mmq.cu | 1505 +---------------- ggml-cuda/mmq.cuh | 1300 ++++++++++++++ ggml-cuda/mmvq.cu | 137 +- .../fattn-vec-f16-instance-hs128-f16-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-f16-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_0-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q4_1-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_0-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q5_1-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-f16.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs128-q8_0-q8_0.cu | 2 +- .../fattn-vec-f16-instance-hs256-f16-f16.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-f16.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q4_0.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q4_1.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q5_0.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q5_1.cu | 2 +- .../fattn-vec-f16-instance-hs64-f16-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-f16-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q4_1.cu | 2 +- 
.../fattn-vec-f32-instance-hs128-q4_0-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_0-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q4_1-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_0-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q5_1-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-f16.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs128-q8_0-q8_0.cu | 2 +- .../fattn-vec-f32-instance-hs256-f16-f16.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-f16.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q4_0.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q4_1.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q5_0.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q5_1.cu | 2 +- .../fattn-vec-f32-instance-hs64-f16-q8_0.cu | 2 +- .../fattn-wmma-f16-instance-kqfloat-cpb16.cu | 2 +- .../fattn-wmma-f16-instance-kqfloat-cpb32.cu | 2 +- .../fattn-wmma-f16-instance-kqhalf-cpb16.cu | 2 +- .../fattn-wmma-f16-instance-kqhalf-cpb32.cu | 2 +- .../fattn-wmma-f16-instance-kqhalf-cpb8.cu | 2 +- .../template-instances/generate_cu_files.py | 16 + .../template-instances/mmq-instance-q2_k.cu | 5 + .../template-instances/mmq-instance-q3_k.cu | 5 + .../template-instances/mmq-instance-q4_0.cu | 5 + .../template-instances/mmq-instance-q4_1.cu | 5 + .../template-instances/mmq-instance-q4_k.cu | 5 + .../template-instances/mmq-instance-q5_0.cu | 5 + .../template-instances/mmq-instance-q5_1.cu | 5 + .../template-instances/mmq-instance-q5_k.cu | 5 + .../template-instances/mmq-instance-q6_k.cu | 5 + .../template-instances/mmq-instance-q8_0.cu | 5 + ggml-cuda/vecdotq.cuh | 78 +- 112 files changed, 1783 insertions(+), 1767 deletions(-) create mode 100644 ggml-cuda/template-instances/mmq-instance-q2_k.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q3_k.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q4_0.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q4_1.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q4_k.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q5_0.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q5_1.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q5_k.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q6_k.cu create mode 100644 ggml-cuda/template-instances/mmq-instance-q8_0.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index cf37d5bb2..b1d6afbbc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -416,6 +416,8 @@ if (LLAMA_CUDA) list(APPEND GGML_SOURCES_CUDA 
"ggml-cuda.cu") file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") list(APPEND GGML_SOURCES_CUDA ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_CUDA ${SRCS}) add_compile_definitions(GGML_USE_CUDA) add_compile_definitions(GGML_CUDA_USE_GRAPHS) @@ -588,6 +590,8 @@ if (LLAMA_HIPBLAS) list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu") list(APPEND GGML_SOURCES_ROCM ${SRCS}) + file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu") + list(APPEND GGML_SOURCES_ROCM ${SRCS}) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) diff --git a/Makefile b/Makefile index 802ee6a47..895c62f84 100644 --- a/Makefile +++ b/Makefile @@ -444,6 +444,7 @@ ifdef LLAMA_CUBLAS endif OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu)) +OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu)) ifdef LLAMA_CUDA_FA_ALL_QUANTS OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu)) else diff --git a/ggml-common.h b/ggml-common.h index 77e6bfba4..e8efceb76 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -123,12 +123,18 @@ typedef sycl::half2 ggml_half2; #define QI1_S (QK_K / (4*QR1_S)) #define QR1_S 8 +#define QI1_M (QK_K / (4*QR1_M)) +#define QR1_M 8 + #define QI4_NL (QK4_NL / (4*QR4_NL)) #define QR4_NL 2 #define QI4_XS (QK_K / (4*QR4_XS)) #define QR4_XS 8 +#define QI3_S (QK_K / (4*QR3_S)) +#define QR3_S 8 + #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP #define QK4_0 32 diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c81c6a0d7..dad8a9e2d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -633,88 +633,22 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { // cuda split buffer -static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { - int64_t min_compute_capability = INT_MAX; - int64_t max_compute_capability = INT_MIN; +static int64_t get_row_rounding(const std::array & tensor_split) { + int64_t row_rounding = 0; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { - if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { - if (min_compute_capability > ggml_cuda_info().devices[id].cc) { - min_compute_capability = ggml_cuda_info().devices[id].cc; - } - if (max_compute_capability < ggml_cuda_info().devices[id].cc) { - max_compute_capability = ggml_cuda_info().devices[id].cc; - } + if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { + continue; } - } -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return max_compute_capability >= CC_RDNA2 ? 128 : 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - return max_compute_capability >= CC_RDNA2 ? 128 : 32; - case GGML_TYPE_Q3_K: - return min_compute_capability < CC_RDNA2 ? 128 : 64; - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - return max_compute_capability >= CC_RDNA2 ? 
128 : 64; - default: - GGML_ASSERT(false); + const int cc = ggml_cuda_info().devices[id].cc; + row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc))); } -#else - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - return max_compute_capability >= CC_VOLTA ? 128 : 64; - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - return max_compute_capability >= CC_VOLTA ? 128 : 64; - case GGML_TYPE_Q6_K: - return 64; - default: - GGML_ASSERT(false); - } -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + return row_rounding; } static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { const int64_t nrows = ggml_nrows(tensor); - const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + const int64_t rounding = get_row_rounding(tensor_split); *row_low = id == 0 ? 0 : nrows*tensor_split[id]; *row_low -= *row_low % rounding; @@ -1499,7 +1433,7 @@ static void ggml_cuda_op_mul_mat( // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { - const int64_t rounding = get_row_rounding(src0->type, tensor_split); + const int64_t rounding = get_row_rounding(tensor_split); if (id != 0) { dev[id].row_low = ne01*tensor_split[id]; diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh index 22872ca5c..90a0a81ea 100644 --- a/ggml-cuda/common.cuh +++ b/ggml-cuda/common.cuh @@ -160,7 +160,7 @@ #endif #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels -#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available +#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available #define MATRIX_ROW_PADDING 512 // last row of quant. 
matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -484,6 +484,161 @@ static __device__ __forceinline__ float get_alibi_slope( return powf(base, exph); } +template +struct ggml_cuda_type_traits; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = 1; + static constexpr int qr = 1; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK4_0; + static constexpr int qr = QR4_0; + static constexpr int qi = QI4_0; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK4_1; + static constexpr int qr = QR4_1; + static constexpr int qi = QI4_1; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK5_0; + static constexpr int qr = QR5_0; + static constexpr int qi = QI5_0; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK5_1; + static constexpr int qr = QR5_1; + static constexpr int qi = QI5_1; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK8_0; + static constexpr int qr = QR8_0; + static constexpr int qi = QI8_0; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR2_K; + static constexpr int qi = QI2_K; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR3_K; + static constexpr int qi = QI3_K; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR4_K; + static constexpr int qi = QI4_K; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR5_K; + static constexpr int qi = QI5_K; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR6_K; + static constexpr int qi = QI6_K; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR2_XXS; + static constexpr int qi = QI2_XXS; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR2_XS; + static constexpr int qi = QI2_XS; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR2_S; + static constexpr int qi = QI2_S; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR3_XXS; + static constexpr int qi = QI3_XXS; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR1_S; + static constexpr int qi = QI1_S; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR1_M; + static constexpr int qi = QI1_M; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK4_NL; + static constexpr int qr = QR4_NL; + static constexpr int qi = QI4_NL; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR4_XS; + static constexpr int qi = QI4_XS; +}; + +template<> +struct ggml_cuda_type_traits { + static constexpr int qk = QK_K; + static constexpr int qr = QR3_S; + static constexpr int qi = QI3_S; +}; + +static int get_mmq_x_max_host(const int cc) { +#ifdef CUDA_USE_TENSOR_CORES + return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64; +#else + return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 
128 : 64; +#endif // CUDA_USE_TENSOR_CORES +} + +// Round rows to this value for --split-mode row: +static int get_mmq_y_host(const int cc, const int mmq_x) { + return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64; +} + ////////////////////// struct ggml_cuda_device_info { diff --git a/ggml-cuda/dmmv.cu b/ggml-cuda/dmmv.cu index 47d4d5d9e..174489e06 100644 --- a/ggml-cuda/dmmv.cu +++ b/ggml-cuda/dmmv.cu @@ -422,10 +422,22 @@ static __device__ void convert_f16(const void * vx, const int64_t ib, const int v.y = x[ib + iqs + 1]; } -template +static constexpr __device__ dequantize_kernel_t get_dequantize_kernel(ggml_type type) { + return type == GGML_TYPE_Q4_0 ? dequantize_q4_0 : + type == GGML_TYPE_Q4_1 ? dequantize_q4_1 : + type == GGML_TYPE_Q5_0 ? dequantize_q5_0 : + type == GGML_TYPE_Q5_1 ? dequantize_q5_1 : + type == GGML_TYPE_Q8_0 ? dequantize_q8_0 : + type == GGML_TYPE_F16 ? convert_f16 : + nullptr; +} + +template static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { - // qk = quantized weights per x block - // qr = number of quantized weights per data value in x block + constexpr int qk = ggml_cuda_type_traits::qk; // quantized weights per x block + constexpr int qr = ggml_cuda_type_traits::qr; // number of quantized weights per data value in x block + constexpr dequantize_kernel_t dequantize_kernel = get_dequantize_kernel(type); + const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y; if (row >= nrows) { @@ -493,7 +505,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec + dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } @@ -502,7 +514,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec + dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } @@ -511,7 +523,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec + dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } @@ -520,7 +532,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec + dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } @@ -529,7 +541,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec + dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } @@ -580,7 +592,7 @@ static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, floa const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 
block_nums(block_num_y, 1, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec<1, 1, convert_f16> + dequantize_mul_mat_vec <<>>(vx, y, dst, ncols, nrows); } diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index ebe1dc5c8..58799e4ca 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -1,1450 +1,4 @@ #include "mmq.cuh" -#include "vecdotq.cuh" - -typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); -typedef void (*load_tiles_cuda_t)( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); -typedef float (*vec_dot_q_mul_mat_cuda_t)( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); -typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); -typedef void (mul_mat_q_t)( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst); - -struct mmq_arch_config_t { - int x; - int y; - int nwarps; -}; - -struct mmq_config_t { - mmq_arch_config_t rdna2; - mmq_arch_config_t rdna1; - mmq_arch_config_t ampere; - mmq_arch_config_t pascal; -}; - -constexpr mmq_config_t MMQ_CONFIG_Q4_0 = { -// x y nwarps - { 64, 128, 8}, - { 64, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - { 64, 128, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q4_1 = { -// x y nwarps - { 64, 128, 8}, - { 64, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - { 64, 128, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q5_0 = { -// x y nwarps - { 64, 128, 8}, - { 64, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - {128, 64, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q5_1 = { -// x y nwarps - { 64, 128, 8}, - { 64, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - {128, 64, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q8_0 = { -// x y nwarps - { 64, 128, 8}, - { 64, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - {128, 64, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q2_K = { -// x y nwarps - { 64, 128, 8}, - {128, 32, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - { 64, 128, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q3_K = { -// x y nwarps - {128, 64, 8}, - { 32, 128, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - {128, 128, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q4_K = { -// x y nwarps - { 64, 128, 8}, - { 32, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - { 64, 128, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q5_K = { -// x y nwarps - { 64, 128, 8}, - { 32, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - { 64, 128, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; -constexpr mmq_config_t MMQ_CONFIG_Q6_K = { -// x y nwarps 
- { 64, 128, 8}, - { 32, 64, 8}, -#ifdef CUDA_USE_TENSOR_CORES - { 4, 32, 4}, -#else - { 64, 64, 4}, -#endif // CUDA_USE_TENSOR_CORES - { 64, 64, 8}, -}; - -// ------------------------------------------------------------ - -template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - GGML_UNUSED(x_sc); - - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; - - *x_ql = tile_x_qs; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q4_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_0; - const int kqsx = k % QI4_0; - - const block_q4_0 * bx0 = (const block_q4_0 *) vx; - - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { - int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; - } -} - -static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; - - int u[2*VDR_Q4_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; - } - - return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; - - *x_ql = tile_x_qs; - *x_dm = tile_x_dm; -} - -template static __device__ __forceinline__ void load_tiles_q4_1( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - 
GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_1; - const int kqsx = k % QI4_1; - - const block_q4_1 * bx0 = (const block_q4_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { - int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; - } -} - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - - int u[2*VDR_Q4_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; - } - - return vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; - - *x_ql = tile_x_ql; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q5_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_0; - const int kqsx = k % QI5_0; - - const block_q5_0 * bx0 = (const block_q5_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8(bxi->qs, kqsx); - const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - qs1 = __vsubss4(qs1, 
0x10101010); // subtract 16 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { - int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; - } -} - -static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - int u[2*VDR_Q5_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; - } - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - - -template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; -} - -template static __device__ __forceinline__ void load_tiles_q5_1( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_1; - const int kqsx = k % QI5_1; - - const block_q5_1 * bx0 = (const block_q5_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { - int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = 
min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; - } -} - -static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; - - int u[2*VDR_Q5_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; - } - - return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; - - *x_ql = tile_x_qs; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q8_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI8_0; - const int kqsx = k % QI8_0; - float * x_dmf = (float *) x_dm; - - const block_q8_0 * bx0 = (const block_q8_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { - int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; - } -} - -static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); -} - -template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * 
(WARP_SIZE/QI2_K) + mmq_y/QI2_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q2_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI2_K; - const int kqsx = k % QI2_K; - - const block_q2_K * bx0 = (const block_q2_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); - - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); - } -} - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const int kbx = k / QI2_K; - const int ky = (k % QI2_K) * QR2_K; - const float * y_df = (const float *) y_ds; - - int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); - const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); - -#pragma unroll - for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { - v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; - } - - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - - const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; - return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); -} - -template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; - __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_qh = tile_x_qh; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q3_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const 
int & i_max, const int & k, const int & blocks_per_row) { - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI3_K; - const int kqsx = k % QI3_K; - - const block_q3_K * bx0 = (const block_q3_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); - - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); - - const int ksc = k % (QI3_K/4); - - const int ksc_low = ksc % (QI3_K/8); - const int shift_low = 4 * (ksc / (QI3_K/8)); - const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; - - const int ksc_high = QI3_K/8; - const int shift_high = 2 * ksc; - const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; - - const int sc = __vsubss4(sc_low | sc_high, 0x20202020); - - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; - } -} - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - - const int kbx = k / QI3_K; - const int ky = (k % QI3_K) * QR3_K; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; - - int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); - const int shift = 2 * ((ky % 32) / 8); - const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); - const int vlh = (vh << 2) & 0x04040404; - - v[l] = __vsubss4(vll, vlh); - } - - const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; - return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); -} - -template static __device__ __forceinline__ void allocate_tiles_q4_K(int 
** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q4_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_K; // == 0 if QK_K == 256 - const int kqsx = k % QI4_K; // == k if QK_K == 256 - - const block_q4_K * bx0 = (const block_q4_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { - int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; - } -} - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - - const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q5_K( - const void * __restrict__ vx, int 
* __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_K; // == 0 if QK_K == 256 - const int kqsx = k % QI5_K; // == k if QK_K == 256 - - const block_q5_K * bx0 = (const block_q5_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR5_K*kqsx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); - const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; - const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; - - const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; - const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); - - x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; - x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { - int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; - } -} - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; - return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); -} - -template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; - - *x_ql = 
tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q6_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - const block_q6_K * bx0 = (const block_q6_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR6_K*kqsx; - - const int ql = get_int_from_uint8(bxi->ql, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); - const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; - const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; - - const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; - const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); - - x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); - x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { - int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; - - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); - } -} - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); - - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; - return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); -} - -template -static __device__ __forceinline__ void mul_mat_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - 
const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE / qi; - - const int & ncols_dst = ncols_y; - - const int row_dst_0 = blockIdx.x*mmq_y; - const int & row_x_0 = row_dst_0; - - const int col_dst_0 = blockIdx.y*mmq_x; - const int & col_y_0 = col_dst_0; - - int * tile_x_ql = nullptr; - half2 * tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - - allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - - __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; - - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; - - for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - - load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); - -#pragma unroll - for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir*WARP_SIZE + threadIdx.x; - const int kbxd = kqs / QI8_1; - -#pragma unroll - for (int i = 0; i < mmq_x; i += nwarps) { - const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses - - const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - - const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; - tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - -#pragma unroll - for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE/QI8_1); - const int col_y_eff = min(col_y_0 + ids, ncols_y-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } - } - - __syncthreads(); - -// #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - sum[i/WARP_SIZE][j/nwarps] += vec_dot( - tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, - threadIdx.x + i, threadIdx.y + j, k); - } - } - } - - __syncthreads(); - } - } - -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { - const int col_dst = col_dst_0 + j + threadIdx.y; - - if (col_dst >= ncols_dst) { - return; - } - -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - const int row_dst = row_dst_0 + threadIdx.x + i; - - if (row_dst >= nrows_dst) { - continue; - } - - dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; - } - } -} - -static constexpr __device__ mmq_arch_config_t get_arch_config_device(mmq_config_t mmq_config) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - -#if defined(RDNA3) || defined(RDNA2) - return mmq_config.rdna2; -#else - return mmq_config.rdna1; -#endif // defined(RDNA3) || defined(RDNA2) - -#else - -#if __CUDA_ARCH__ >= CC_VOLTA - return mmq_config.ampere; -#else - return mmq_config.pascal; -#endif // __CUDA_ARCH__ >= CC_VOLTA - -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -} - -template 
static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_0.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - mul_mat_q4_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_0); - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_1.pascal.nwarps, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q4_1( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_1); - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_0.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - mul_mat_q5_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_0); - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_1.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -mul_mat_q5_1( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_1); - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, 
vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q8_0.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - mul_mat_q8_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0); - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q2_K.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -mul_mat_q2_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q2_K); - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q3_K.pascal.nwarps, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q3_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q3_K); - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q4_K( - const void * __restrict__ vx, const void * __restrict__ vy, 
float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q4_K); - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q5_K.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -mul_mat_q5_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q5_K); - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q6_K.rdna2.nwarps, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*MMQ_CONFIG_Q4_K.pascal.nwarps, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q6_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A - constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q6_K); - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(get_arch_config_device); - GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define MMQ_SWITCH_CASE(type_suffix) \ - case GGML_TYPE_Q##type_suffix: if (row_diff % arch_config.y == 0) { \ - const bool need_check = false; \ - mul_mat_q##type_suffix<<>> \ - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \ - } else { \ - const bool need_check = true; \ - mul_mat_q##type_suffix<<>> \ - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst); \ - } break; \ void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, @@ -1454,12 +8,15 @@ void ggml_cuda_op_mul_mat_q( const int64_t ne00 = src0->ne[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t ne10 = src1->ne[0]; GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne0 = dst->ne[0]; const int64_t row_diff = row_high - row_low; + const int64_t stride00 = nb01 / ggml_type_size(src0->type); int id = ggml_cuda_get_device(); const int compute_capability = ggml_cuda_info().devices[id].cc; @@ -1468,78 +25,44 @@ void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t 
nrows_dst = id == ctx.device ? ne0 : row_diff; - mmq_config_t mmq_config; + const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst}; switch (src0->type) { case GGML_TYPE_Q4_0: - mmq_config = MMQ_CONFIG_Q4_0; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q4_1: - mmq_config = MMQ_CONFIG_Q4_1; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q5_0: - mmq_config = MMQ_CONFIG_Q5_0; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q5_1: - mmq_config = MMQ_CONFIG_Q5_1; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q8_0: - mmq_config = MMQ_CONFIG_Q8_0; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q2_K: - mmq_config = MMQ_CONFIG_Q2_K; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q3_K: - mmq_config = MMQ_CONFIG_Q3_K; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q4_K: - mmq_config = MMQ_CONFIG_Q4_K; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q5_K: - mmq_config = MMQ_CONFIG_Q5_K; + mul_mat_q_case(args, stream); break; case GGML_TYPE_Q6_K: - mmq_config = MMQ_CONFIG_Q6_K; + mul_mat_q_case(args, stream); break; default: GGML_ASSERT(false); break; } - mmq_arch_config_t arch_config; - if (compute_capability >= CC_RDNA2) { - arch_config = mmq_config.rdna2; - } else if (compute_capability >= CC_OFFSET_AMD) { - arch_config = mmq_config.rdna1; - } else if (compute_capability >= CC_VOLTA) { - arch_config = mmq_config.ampere; - } else if (compute_capability >= MIN_CC_DP4A) { - arch_config = mmq_config.pascal; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (row_diff + arch_config.y - 1) / arch_config.y; - const int block_num_y = (src1_ncols + arch_config.x - 1) / arch_config.x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, arch_config.nwarps, 1); - - switch (src0->type) { - MMQ_SWITCH_CASE(4_0) - MMQ_SWITCH_CASE(4_1) - MMQ_SWITCH_CASE(5_0) - MMQ_SWITCH_CASE(5_1) - MMQ_SWITCH_CASE(8_0) - MMQ_SWITCH_CASE(2_K) - MMQ_SWITCH_CASE(3_K) - MMQ_SWITCH_CASE(4_K) - MMQ_SWITCH_CASE(5_K) - MMQ_SWITCH_CASE(6_K) - default: - GGML_ASSERT(false); - break; - } - GGML_UNUSED(src1); GGML_UNUSED(dst); GGML_UNUSED(src1_ddf_i); diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh index 807817c4a..6744cce6d 100644 --- a/ggml-cuda/mmq.cuh +++ b/ggml-cuda/mmq.cuh @@ -1,4 +1,1304 @@ #include "common.cuh" +#include "vecdotq.cuh" + +#include +#include + +typedef void (*load_tiles_mmq_t)( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride); +typedef void (*vec_dot_mmq_t)( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0); + +struct tile_x_sizes { + int ql; + int dm; + int qh; + int sc; +}; + +// get_mmq_x_max_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row + +static constexpr __device__ int get_mmq_x_max_device() { +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + return 64; +#else +#if __CUDA_ARCH__ >= CC_VOLTA +#ifdef CUDA_USE_TENSOR_CORES + return MMQ_MAX_BATCH_SIZE; +#else + return 128; +#endif // CUDA_USE_TENSOR_CORES +#else + return 64; +#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +// 
get_mmq_y_host is in common.cuh so that it can be used to determine the correct way to round for --split-mode row + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +static constexpr __device__ int get_mmq_y_device(int mmq_x) { + return mmq_x >= 32 ? 128 : 64; +} +#else +#if __CUDA_ARCH__ >= CC_VOLTA +static constexpr __device__ int get_mmq_y_device(int mmq_x) { + return mmq_x >= 32 ? 128 : 64; +} +#else +static constexpr __device__ int get_mmq_y_device(int /*mmq_x*/) { + return 64; +} +#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + +#define TILE_X_SIZES_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0, 0} +#define TILE_X_SIZES_Q4_1 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_1 + mmq_y/QI4_1, 0, 0} +#define TILE_X_SIZES_Q5_0 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_0 + mmq_y/QI5_0, 0, 0} +#define TILE_X_SIZES_Q5_1 tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_1 + mmq_y/QI5_1, 0, 0} +#define TILE_X_SIZES_Q8_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI8_0 + mmq_y/QI8_0, 0, 0} +#define TILE_X_SIZES_Q2_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI2_K + mmq_y/QI2_K, 0, mmq_y*WARP_SIZE/4 + mmq_y/4} +#define TILE_X_SIZES_Q3_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI3_K + mmq_y/QI3_K, mmq_y*WARP_SIZE/2 + mmq_y/2, mmq_y*WARP_SIZE/4 + mmq_y/4} +#define TILE_X_SIZES_Q4_K tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_K + mmq_y/QI4_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8} +#define TILE_X_SIZES_Q5_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI5_K + mmq_y/QI5_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8} +#define TILE_X_SIZES_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, 0, mmq_y*WARP_SIZE/8 + mmq_y/8} + +#define GET_TILE_X_SIZES_BODY \ + return type == GGML_TYPE_Q4_0 ? TILE_X_SIZES_Q4_0 : \ + type == GGML_TYPE_Q4_1 ? TILE_X_SIZES_Q4_1 : \ + type == GGML_TYPE_Q5_0 ? TILE_X_SIZES_Q5_0 : \ + type == GGML_TYPE_Q5_1 ? TILE_X_SIZES_Q5_1 : \ + type == GGML_TYPE_Q8_0 ? TILE_X_SIZES_Q8_0 : \ + type == GGML_TYPE_Q2_K ? TILE_X_SIZES_Q2_K : \ + type == GGML_TYPE_Q3_K ? TILE_X_SIZES_Q3_K : \ + type == GGML_TYPE_Q4_K ? TILE_X_SIZES_Q4_K : \ + type == GGML_TYPE_Q5_K ? TILE_X_SIZES_Q5_K : \ + type == GGML_TYPE_Q6_K ? 
TILE_X_SIZES_Q6_K : \ + tile_x_sizes{0, 0, 0, 0} + +static tile_x_sizes get_tile_x_sizes_host(const ggml_type type, const int mmq_y) { + GET_TILE_X_SIZES_BODY; +} + +template +static constexpr __device__ tile_x_sizes get_tile_x_sizes_device(ggml_type type) { + GET_TILE_X_SIZES_BODY; +} + +// ------------------------------------------------------------ + +template static __device__ __forceinline__ void load_tiles_q4_0( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kbx = threadIdx.x / QI4_0; + const int kqsx = threadIdx.x % QI4_0; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbx; + + x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + threadIdx.y * QI4_0 + threadIdx.x / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = (const block_q4_0 *) x + kbx0 + i*stride + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +template +static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q4_1( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kbx = threadIdx.x / QI4_1; + const int kqsx = threadIdx.x % QI4_1; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbx; + + x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + 
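    // Unlike q4_0, which stores only a float scale through the x_dmf alias, q4_1 keeps the
    // full half2 (d, m) pair per block. Each thread writes one such pair: the low bits of
    // threadIdx.x pick the block within the row (kbxd), while threadIdx.y and the remaining
    // bits of threadIdx.x pick the tile row i computed below.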
int i = i0 + threadIdx.y * QI4_1 + threadIdx.x / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = (const block_q4_1 *) x + kbx0 + i*stride + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +template +static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q5_0( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kbx = threadIdx.x / QI5_0; + const int kqsx = threadIdx.x % QI5_0; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + threadIdx.y * QI5_0 + threadIdx.x / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = (const block_q5_0 *) x + kbx0 + i*stride + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +template +static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { 
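    // In the refactored MMQ path the per-type vec_dot functions no longer return a single
    // float; instead each thread accumulates its part of the (mmq_x x mmq_y) tile into the
    // caller-provided sum[] array at a fixed k0: j strides over tile columns in steps of
    // nwarps and i strides over tile rows in steps of WARP_SIZE, matching the indexing
    // sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] used below. For q5_0 the quants were
    // already expanded to signed 8-bit values at load time, so the dot product can reuse
    // vec_dot_q8_0_q8_1_impl with the float scales stored in the x_dm tile (x_dmf alias).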
+ + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + } + } +} + + +template static __device__ __forceinline__ void load_tiles_q5_1( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kbx = threadIdx.x / QI5_1; + const int kqsx = threadIdx.x % QI5_1; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (threadIdx.x % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*threadIdx.x+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + threadIdx.y * QI5_1 + threadIdx.x / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = (const block_q5_1 *) x + kbx0 + i*stride + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +template +static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs 
+ l + QI5_1) % WARP_SIZE]; + } + + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q8_0( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kbx = threadIdx.x / QI8_0; + const int kqsx = threadIdx.x % QI8_0; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbx; + + x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + threadIdx.y * QI8_0 + threadIdx.x / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = (const block_q8_0 *) x + kbx0 + i*stride + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +template +static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q2_K( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); + + const int kbx = threadIdx.x / QI2_K; + const int kqsx = threadIdx.x % QI2_K; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + kbx; + + x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + threadIdx.y * QI2_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + threadIdx.y * 4 + 
threadIdx.x / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = (const block_q2_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + threadIdx.x % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, threadIdx.x % (QI2_K/4)); + } +} + +template +static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const int kbx = k0 / QI2_K; + const int ky = (k0 % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE; + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq( + v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q3_K( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + + const int kbx = threadIdx.x / QI3_K; + const int kqsx = threadIdx.x % QI3_K; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + kbx; + + x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = threadIdx.x % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + threadIdx.y * QI3_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + threadIdx.y * 2 + threadIdx.x / (WARP_SIZE/2); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + threadIdx.x % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, threadIdx.x % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + threadIdx.y * 4 + threadIdx.x / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = (const block_q3_K *) x + kbx0 + i*stride + (threadIdx.x % 
(WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = threadIdx.x % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = __vsubss4(sc_low | sc_high, 0x20202020); + + x_sc[i * (WARP_SIZE/4) + i / 4 + threadIdx.x % (WARP_SIZE/4)] = sc; + } +} + +template +static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const int kbx = k0 / QI3_K; + const int ky = (k0 % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = __vsubss4(vll, vlh); + } + + const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE; + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq( + v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q4_K( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); + + const int kbx = 0; // threadIdx.x / QI4_K + const int kqsx = threadIdx.x; // threadIdx.x % QI4_K + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbx; + + x_ql[i * (WARP_SIZE + 1) + threadIdx.x] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + threadIdx.y * QI4_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + kbxd; + + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride + (threadIdx.x % 
(WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = threadIdx.x % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +template +static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE; + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq( + &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q5_K( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); + + const int kbx = 0; // threadIdx.x / QI5_K + const int kqsx = threadIdx.x; // threadIdx.x % QI5_K + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + threadIdx.x % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + threadIdx.y * QI5_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + kbxd; + + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = (const block_q5_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = threadIdx.x % 
(WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +template +static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0; + const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE; + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq( + &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + } + } +} + +template static __device__ __forceinline__ void load_tiles_q6_K( + const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride) { + GGML_UNUSED(x_qh); + + const int kbx = 0; // threadIdx.x / QI6_K + const int kqsx = threadIdx.x; // threadIdx.x % QI6_K + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + threadIdx.x % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = threadIdx.x % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + threadIdx.y * QI6_K + threadIdx.x / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + threadIdx.y * 8 + threadIdx.x / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = (const block_q6_K *) x + kbx0 + i*stride + (threadIdx.x % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + threadIdx.x % (WARP_SIZE/8)] = 
get_int_from_int8(bxi->scales, threadIdx.x % (QI6_K/8)); + } +} + +template +static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + + GGML_UNUSED(x_qh); + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = j0 + threadIdx.y; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0; + const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE; + sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq( + &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); + } + } +} + +// ------------------------------------------------------------------------------------------------------------------------------------- + +template +struct mmq_type_traits; + +template +struct mmq_type_traits { + static constexpr bool need_sum = true; + static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = true; + static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = false; + static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = true; + static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = false; + static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = false; + static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = false; + static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = true; + static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = true; + static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ; + static 
constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat; +}; + +template +struct mmq_type_traits { + static constexpr bool need_sum = false; + static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ; + static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K; + static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat; +}; + +template +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*nwarps, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#else +#if __CUDA_ARCH__ >= CC_VOLTA + __launch_bounds__(WARP_SIZE*nwarps, 1) +#else + __launch_bounds__(WARP_SIZE*nwarps, type == GGML_TYPE_Q2_K ? 1 : 2) +#endif // __CUDA_ARCH__ >= CC_VOLTA +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +static __global__ void mul_mat_q( + const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, + const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) { + + // Skip unused template specializations for faster compilation: + if (mmq_x > get_mmq_x_max_device()) { + NO_DEVICE_CODE; + return; + } + + constexpr int qk = ggml_cuda_type_traits::qk; + constexpr int qr = ggml_cuda_type_traits::qr; + constexpr int qi = ggml_cuda_type_traits::qi; + constexpr int mmq_y = get_mmq_y_device(mmq_x); + constexpr bool need_sum = mmq_type_traits::need_sum; + constexpr int vdr = mmq_type_traits::vdr; + constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; + constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot; + + constexpr tile_x_sizes txs = get_tile_x_sizes_device(type); + + extern __shared__ char data_mul_mat_q[]; + int * tile_x_ql = (int *) data_mul_mat_q; + half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql); + int * tile_x_qh = (int *) (tile_x_dm + txs.dm); + int * tile_x_sc = (int *) (tile_x_qh + txs.qh); + int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE] + half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1]; + + const block_q8_1 * y = (const block_q8_1 *) yc; + + const int blocks_per_row_x = ne00 / qk; + const int blocks_per_col_y = ne10 / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ne1 = ne11; + + const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1; + + float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f}; + + for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) { + + load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00); + +#pragma unroll + for (int kr = 0; kr < qr; ++kr) { + const int kqs = kr*WARP_SIZE + threadIdx.x; + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i0 = 0; i0 < mmq_x; i0 += nwarps) { + const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd]; + + const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead 
of time + const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = __low2float(*dsi_src); + } + } + + __syncthreads(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) { + vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0); + } + + __syncthreads(); + } + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { + const int j = blockIdx.y*mmq_x + j0 + threadIdx.y; + + if (j >= ne1) { + return; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { + const int i = blockIdx.x*mmq_y + i0 + threadIdx.x; + + if (need_check && i >= ne0) { + continue; + } + + dst[j*ne0 + i] = sum[(j0/nwarps) * (mmq_y/WARP_SIZE) + i0/WARP_SIZE]; + } + } +} + +struct mmq_args { + const char * x; const char * y; float * dst; + int64_t ne00; int64_t ne01; int64_t stride00; + int64_t ne10; int64_t ne11; + int64_t ne0; +}; + +template +static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) { + const int id = ggml_cuda_get_device(); + const int cc = ggml_cuda_info().devices[id].cc; + const int mmq_y = get_mmq_y_host(cc, mmq_x); + + const int block_num_x = (args.ne01 + mmq_y - 1) / mmq_y; + const int block_num_y = (args.ne11 + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y); + const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int); + const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2); + const int shmem = shmem_x + shmem_y; + +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) + static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; + if (!shmem_limit_raised[id]) { + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); + CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem)); + shmem_limit_raised[id] = true; + } +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) + + if (args.ne01 % mmq_y == 0) { + const bool need_check = false; + mul_mat_q<<>> + (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0); + } else { + const bool need_check = true; + mul_mat_q<<>> + (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0); + } +} + +template +void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) { + const int id = ggml_cuda_get_device(); + const int nsm = ggml_cuda_info().devices[id].nsm; + const int cc = ggml_cuda_info().devices[id].cc; + + const int mmq_x_max = get_mmq_x_max_host(cc); + const int mmq_y = get_mmq_y_host(cc, mmq_x_max); + const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y; + + int mmq_x_best = 0; + int nwaves_best = INT_MAX; + + for (int mmq_x = 8; mmq_x <= mmq_x_max && nwaves_best > 1; mmq_x += 8) { + const int block_num_x = (args.ne11 + mmq_x - 1) / mmq_x; + const int nwaves = (block_num_x*block_num_y + nsm - 1) / nsm; + + if (nwaves < nwaves_best) { + mmq_x_best = mmq_x; + nwaves_best = nwaves; + } + } + + switch (mmq_x_best) { + case 8: + launch_mul_mat_q(args, stream); + break; + 
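    // These cases simply instantiate launch_mul_mat_q for whichever mmq_x_best was selected
    // above. The selection loop searches multiples of 8 up to mmq_x_max and keeps the value
    // that minimizes the number of waves, i.e. (block_num_x*block_num_y + nsm - 1)/nsm with
    // block_num_x = ceil(ne11/mmq_x) and block_num_y = ceil(ne01/mmq_y), stopping early once
    // a single wave suffices. As a rough, purely hypothetical example: for ne01 = 4096,
    // ne11 = 512, mmq_y = 128 and nsm = 80, mmq_x = 8 gives 64*32 = 2048 blocks (26 waves)
    // while mmq_x = 64 gives 8*32 = 256 blocks (4 waves), so the wider tile is preferred.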
case 16: + launch_mul_mat_q(args, stream); + break; + case 24: + launch_mul_mat_q(args, stream); + break; + case 32: + launch_mul_mat_q(args, stream); + break; + case 40: + launch_mul_mat_q(args, stream); + break; + case 48: + launch_mul_mat_q(args, stream); + break; + case 56: + launch_mul_mat_q(args, stream); + break; + case 64: + launch_mul_mat_q(args, stream); + break; + case 72: + launch_mul_mat_q(args, stream); + break; + case 80: + launch_mul_mat_q(args, stream); + break; + case 88: + launch_mul_mat_q(args, stream); + break; + case 96: + launch_mul_mat_q(args, stream); + break; + case 104: + launch_mul_mat_q(args, stream); + break; + case 112: + launch_mul_mat_q(args, stream); + break; + case 120: + launch_mul_mat_q(args, stream); + break; + case 128: + launch_mul_mat_q(args, stream); + break; + default: + GGML_ASSERT(false); + break; + } +} + +#define DECL_MMQ_CASE(type) \ + template void mul_mat_q_case(const mmq_args & args, cudaStream_t stream) \ + +extern DECL_MMQ_CASE(GGML_TYPE_Q4_0); +extern DECL_MMQ_CASE(GGML_TYPE_Q4_1); +extern DECL_MMQ_CASE(GGML_TYPE_Q5_0); +extern DECL_MMQ_CASE(GGML_TYPE_Q5_1); +extern DECL_MMQ_CASE(GGML_TYPE_Q8_0); +extern DECL_MMQ_CASE(GGML_TYPE_Q2_K); +extern DECL_MMQ_CASE(GGML_TYPE_Q3_K); +extern DECL_MMQ_CASE(GGML_TYPE_Q4_K); +extern DECL_MMQ_CASE(GGML_TYPE_Q5_K); +extern DECL_MMQ_CASE(GGML_TYPE_Q6_K); + +// ------------------------------------------------------------------------------------------------------------------------- void ggml_cuda_op_mul_mat_q( ggml_backend_cuda_context & ctx, diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu index 65cc1bcaa..5f056e91e 100644 --- a/ggml-cuda/mmvq.cu +++ b/ggml-cuda/mmvq.cu @@ -1,9 +1,47 @@ #include "mmvq.cuh" #include "vecdotq.cuh" -typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); -template +static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { + return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 : + type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 : + type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 : + type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 : + type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 : + type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 : + type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 : + type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 : + type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 : + type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 : + type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 : + type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 : + type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 : + type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 : + type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 : + type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 : + type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 : + type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 : + type == GGML_TYPE_IQ3_S ? vec_dot_iq3_s_q8_1 : + nullptr; +} + +static constexpr __device__ int get_vdr_mmvq(ggml_type type) { + return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : + type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : + type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : + type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : + type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : + type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : + type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : + type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : + type == GGML_TYPE_Q5_K ? 
VDR_Q5_K_Q8_1_MMVQ : + type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : + type == GGML_TYPE_IQ4_NL ? VDR_Q4_K_Q8_1_MMVQ : + 1; +} + +template #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) // tell the compiler to use as many registers as it wants, see nwarps definition below __launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) @@ -12,6 +50,12 @@ static __global__ void mul_mat_vec_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + constexpr int qk = ggml_cuda_type_traits::qk; + constexpr int qi = ggml_cuda_type_traits::qi; + constexpr int vdr = get_vdr_mmvq(type); + + constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); + #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) constexpr int nwarps = 1; constexpr int rows_per_cuda_block = 1; @@ -29,7 +73,6 @@ static __global__ void mul_mat_vec_q( // partial sum for each thread float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; - const block_q_t * x = (const block_q_t *) vx; const block_q8_1 * y = (const block_q8_1 *) vy; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { @@ -42,8 +85,7 @@ static __global__ void mul_mat_vec_q( for (int j = 0; j < ncols_y; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp[j][i] += vec_dot_q_cuda( - &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs); + tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs); } } } @@ -81,12 +123,12 @@ static __global__ void mul_mat_vec_q( } } -template +template static void mul_mat_vec_q_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - GGML_ASSERT(ncols_x % qk == 0); + GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); int id = ggml_cuda_get_device(); @@ -124,36 +166,28 @@ static void mul_mat_vec_q_cuda( switch (ncols_y) { case 1: - mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 2: - mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 3: - mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 4: - mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 5: - mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 6: - mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 7: - mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; case 8: - 
mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); break; default: GGML_ASSERT(false); @@ -165,152 +199,133 @@ static void mul_mat_vec_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q4_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q5_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q5_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q8_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q2_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q4_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q5_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int 
nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_q6_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq2_xxs_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq2_xs_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq2_s_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq3_xxs_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq1_s_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq1_m_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq4_nl_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq4_xs_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, 
cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } static void mul_mat_vec_iq3_s_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - mul_mat_vec_q_cuda - (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); + mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); } void ggml_cuda_op_mul_mat_vec_q( diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu index d7f103475..6696a2384 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu index f3d8d2eda..dd070db28 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu index 9beb05ca2..54dcde6f5 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu index 0c163dcba..4ec22f791 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu index 3980167b3..3c15bf7f0 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu index fe099921d..7e61b5fdc 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-f16-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu index d4d5e7999..fdb15b580 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu index f08b10c4d..0f7c417d2 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu index e8c3f8adc..851f33c43 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu index c01416a13..763809cbe 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu index 46615f281..f2a276e50 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu index 72dcc1a2f..cb227f6f5 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_0-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu index 9fa8a377d..97ac0520c 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu index 20ea86c6d..c772b4263 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu index ed815957c..5cb743081 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu index bbe9e6a1c..98a709d17 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu index d12a61699..4f2f947ae 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu index 1e901afcb..11f96b6f6 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q4_1-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu index a3f98ce37..b39bdc061 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu index 1bae97243..bbd6a2c7f 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu index 7258e9775..9d84ff2b1 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu index 08435c005..bc8a5bff6 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu index 17864e8e9..a679100c8 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu index 9239138c9..8f21bccf7 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_0-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu index e387d9c1d..858b00fd7 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu index d69d3bbd6..0fc8011fa 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu index 61a478816..261fdf623 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu index 89995080a..0fb824738 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu index 9e6a58dff..a9d9d089b 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu index 153cbfd86..7d7b27920 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q5_1-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu index 09d576558..a092ee2d5 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu index 3e3c91e68..db55927a1 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu index 7b973058f..c3c21cefa 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu index a43a475d4..35dd9f520 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu index 5b570c0a3..050c22ac7 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu index bf2cc684e..de4866c5e 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs128-q8_0-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu index 7428e45ea..57a10bc4b 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs256-f16-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu index 4aee830de..e0f08b46a 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu index 36acb6319..1c8e8a467 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu index a4090c390..cefed83fb 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu index 17b6b2d11..aede6e358 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu index 549e1cea1..1a1a92c78 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu index 66bcd820f..ad667473d 100644 --- a/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f16-instance-hs64-f16-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu index 15933a299..c499f455d 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu index 8aa785583..8286ebf37 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu index bde3924fd..458786882 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu index 1708181c1..d89103ce0 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu index 30fa6fa4c..bb75fd42f 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu index 69673d50f..b1629817e 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-f16-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu index d8b2b2e18..d8657604d 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu index 01cce7ab5..2e5bd2f1a 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu index fd5563b39..be5f302d9 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu index b13cc4a0c..8dd91cd72 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu index 86f1fc637..4cb791502 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu index 26e7df4be..09dea4267 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_0-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu index e4fda8952..0fbb60769 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu index bd15117b4..2aeab83b2 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu index cb6c6a760..599415b49 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu index 201b6641d..e4f8e3083 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu index 6da57a44a..34d166527 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu index 47623c9bf..4bebef45a 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q4_1-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu index 82c6861d2..326468da2 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu index 24a80c2b0..511b58f4e 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu index b95eaf7e1..d9906d142 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu index 275f2efcc..f61c183ab 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu index 3673f7fd5..c10450fd2 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu index 2c4d59947..2d5cb195c 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_0-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu index 2457cdf3f..b384f34d7 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu index b3b411ed3..446e293b1 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu index b7f308a4d..6f4302988 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu index 739686697..1cd8ba88f 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu index 708d03113..1ee2eab65 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu index df891be60..2bc77816a 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q5_1-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu index f49b6d1f9..d55ced08b 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu index 1de92148b..8361e99c4 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu index 7a1ba7f8d..7507a67c4 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu index 25493e4ba..61f050b23 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu index 3cd650c7b..d4a49d9c9 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu index 88ffa43d6..d14627897 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs128-q8_0-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu index 8c7bac6c2..e73f917a1 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs256-f16-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu index a28f62e7b..d40825dfc 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-f16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu index d39838b96..b5c6869f4 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu index 834d40f6c..4e21b0cca 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q4_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu index f7d54668b..2eac321b3 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu index 59e00ad83..f7d2c3b4e 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q5_1.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu index 6e63893de..a013f400b 100644 --- a/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu +++ b/ggml-cuda/template-instances/fattn-vec-f32-instance-hs64-f16-q8_0.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-vec-f32.cuh" diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu index ca356ad6c..2d94e65c2 100644 --- a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-wmma-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu index 430ee64eb..c3d9df3c4 100644 --- a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqfloat-cpb32.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-wmma-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu index d421d17cc..bb680e401 100644 --- a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb16.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
#include "../fattn-wmma-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu index deacd5f58..073f71b1f 100644 --- a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb32.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-wmma-f16.cuh" diff --git a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu index 282896733..d30710c5f 100644 --- a/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu +++ b/ggml-cuda/template-instances/fattn-wmma-f16-instance-kqhalf-cpb8.cu @@ -1,4 +1,4 @@ -// This file has been autogenerated by generate-variants.py, do not edit manually. +// This file has been autogenerated by generate_cu_files.py, do not edit manually. #include "../fattn-wmma-f16.cuh" diff --git a/ggml-cuda/template-instances/generate_cu_files.py b/ggml-cuda/template-instances/generate_cu_files.py index ee5b460e0..ea58d0968 100755 --- a/ggml-cuda/template-instances/generate_cu_files.py +++ b/ggml-cuda/template-instances/generate_cu_files.py @@ -20,6 +20,18 @@ SOURCE_FATTN_WMMA_START = """// This file has been autogenerated by generate_cu_ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}, {kq_acc_t});\n" +TYPES_MMQ = [ + "GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0", + "GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K" +] + +SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE({type}); +""" + def get_short_name(long_quant_name): return long_quant_name.replace("GGML_TYPE_", "").lower() @@ -57,3 +69,7 @@ for kq_acc_t in ["half", "float"]: if kq_acc_t == "float" and cols_per_block == 32 and head_size == 256: # register spilling, bad performance continue f.write(SOURCE_FATTN_WMMA_CASE.format(kq_acc_t=kq_acc_t, cols_per_block=cols_per_block, head_size=head_size)) + +for type in TYPES_MMQ: + with open(f"mmq-instance-{get_short_name(type)}.cu", "w") as f: + f.write(SOURCE_MMQ.format(type=type)) diff --git a/ggml-cuda/template-instances/mmq-instance-q2_k.cu b/ggml-cuda/template-instances/mmq-instance-q2_k.cu new file mode 100644 index 000000000..6415369dc --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q2_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q2_K); diff --git a/ggml-cuda/template-instances/mmq-instance-q3_k.cu b/ggml-cuda/template-instances/mmq-instance-q3_k.cu new file mode 100644 index 000000000..ffb6213af --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q3_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q3_K); diff --git a/ggml-cuda/template-instances/mmq-instance-q4_0.cu b/ggml-cuda/template-instances/mmq-instance-q4_0.cu new file mode 100644 index 000000000..0c0b0c8a8 --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q4_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q4_0); diff --git a/ggml-cuda/template-instances/mmq-instance-q4_1.cu b/ggml-cuda/template-instances/mmq-instance-q4_1.cu new file mode 100644 index 000000000..ee67f6942 --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q4_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q4_1); diff --git a/ggml-cuda/template-instances/mmq-instance-q4_k.cu b/ggml-cuda/template-instances/mmq-instance-q4_k.cu new file mode 100644 index 000000000..9eeb3cd7f --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q4_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q4_K); diff --git a/ggml-cuda/template-instances/mmq-instance-q5_0.cu b/ggml-cuda/template-instances/mmq-instance-q5_0.cu new file mode 100644 index 000000000..cc57fb975 --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q5_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q5_0); diff --git a/ggml-cuda/template-instances/mmq-instance-q5_1.cu b/ggml-cuda/template-instances/mmq-instance-q5_1.cu new file mode 100644 index 000000000..721ac790c --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q5_1.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q5_1); diff --git a/ggml-cuda/template-instances/mmq-instance-q5_k.cu b/ggml-cuda/template-instances/mmq-instance-q5_k.cu new file mode 100644 index 000000000..a2e90ffd5 --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q5_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q5_K); diff --git a/ggml-cuda/template-instances/mmq-instance-q6_k.cu b/ggml-cuda/template-instances/mmq-instance-q6_k.cu new file mode 100644 index 000000000..470938fef --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q6_k.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. + +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q6_K); diff --git a/ggml-cuda/template-instances/mmq-instance-q8_0.cu b/ggml-cuda/template-instances/mmq-instance-q8_0.cu new file mode 100644 index 000000000..974477bbb --- /dev/null +++ b/ggml-cuda/template-instances/mmq-instance-q8_0.cu @@ -0,0 +1,5 @@ +// This file has been autogenerated by generate_cu_files.py, do not edit manually. 
+ +#include "../mmq.cuh" + +DECL_MMQ_CASE(GGML_TYPE_Q8_0); diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh index df9752390..b9573a7c7 100644 --- a/ggml-cuda/vecdotq.cuh +++ b/ggml-cuda/vecdotq.cuh @@ -566,9 +566,9 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( } static __device__ __forceinline__ float vec_dot_q4_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq + kbx; int v[VDR_Q4_0_Q8_1_MMVQ]; int u[2*VDR_Q4_0_Q8_1_MMVQ]; @@ -585,9 +585,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( static __device__ __forceinline__ float vec_dot_q4_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq + kbx; int v[VDR_Q4_1_Q8_1_MMVQ]; int u[2*VDR_Q4_1_Q8_1_MMVQ]; @@ -603,9 +603,9 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( } static __device__ __forceinline__ float vec_dot_q5_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq + kbx; int vl[VDR_Q5_0_Q8_1_MMVQ]; int vh[VDR_Q5_0_Q8_1_MMVQ]; @@ -623,9 +623,9 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( } static __device__ __forceinline__ float vec_dot_q5_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq + kbx; int vl[VDR_Q5_1_Q8_1_MMVQ]; int vh[VDR_Q5_1_Q8_1_MMVQ]; @@ -643,9 +643,9 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( } static __device__ __forceinline__ float vec_dot_q8_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq + kbx; int v[VDR_Q8_0_Q8_1_MMVQ]; int u[VDR_Q8_0_Q8_1_MMVQ]; @@ -660,9 +660,9 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( } static __device__ __forceinline__ float vec_dot_q2_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q2_K * bq2_K = (const block_q2_K *) vbq; + const block_q2_K * bq2_K = (const block_q2_K *) vbq + kbx; const int bq8_offset = QR2_K * (iqs / QI8_1); const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); @@ -683,9 +683,9 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( } static __device__ __forceinline__ float vec_dot_q3_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, 
const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q3_K * bq3_K = (const block_q3_K *) vbq; + const block_q3_K * bq3_K = (const block_q3_K *) vbq + kbx; const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); @@ -710,9 +710,9 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1( } static __device__ __forceinline__ float vec_dot_q4_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q4_K * bq4_K = (const block_q4_K *) vbq; + const block_q4_K * bq4_K = (const block_q4_K *) vbq + kbx; int v[2]; int u[2*QR4_K]; @@ -756,9 +756,9 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } static __device__ __forceinline__ float vec_dot_q5_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q5_K * bq5_K = (const block_q5_K *) vbq; + const block_q5_K * bq5_K = (const block_q5_K *) vbq + kbx; int vl[2]; int vh[2]; @@ -802,9 +802,9 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } static __device__ __forceinline__ float vec_dot_q6_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_q6_K * bq6_K = (const block_q6_K *) vbq; + const block_q6_K * bq6_K = (const block_q6_K *) vbq + kbx; const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); @@ -828,8 +828,8 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( } static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq + kbx; #if QR2_XXS == 8 const int ib32 = iqs; @@ -872,9 +872,9 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( } static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq + kbx; const int ib32 = iqs; const uint16_t * q2 = bq2->qs + 4*ib32; @@ -911,9 +911,9 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( // TODO static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq2_s * bq2 = (const block_iq2_s *) 
vbq; + const block_iq2_s * bq2 = (const block_iq2_s *) vbq + kbx; const int ib32 = iqs; const int8_t * q8 = bq8_1[ib32].qs; @@ -951,9 +951,9 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( } static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; + const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq + kbx; const int ib32 = iqs; const uint8_t * q3 = bq2->qs + 8*ib32; @@ -981,9 +981,9 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( // TODO: don't use lookup table for signs static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + const block_iq3_s * bq2 = (const block_iq3_s *) vbq + kbx; const int ib32 = iqs; const uint8_t * qs = bq2->qs + 8*ib32; @@ -1008,8 +1008,8 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( } static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_iq1_s * bq1 = (const block_iq1_s *) vbq; + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + const block_iq1_s * bq1 = (const block_iq1_s *) vbq + kbx; const int ib32 = iqs; int sumi = 0; @@ -1039,8 +1039,8 @@ static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( } static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { + const block_iq1_m * bq1 = (const block_iq1_m *) vbq + kbx; const int ib32 = iqs; int sumi[2] = {0, 0}; @@ -1094,9 +1094,9 @@ static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4 #endif static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { - const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + const block_iq4_nl * bq = (const block_iq4_nl *) vbq + kbx; #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; @@ -1128,10 +1128,10 @@ static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( } static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs) { #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq 
+ kbx; const uint8_t * values = (const uint8_t *)kvalues_iq4nl; // iqs is 0...7 @@ -1149,6 +1149,6 @@ static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( } return d * (sumi1 + sumi2); #else - return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs); + return vec_dot_iq4_xs_q8_1(vbq, bq8_1, kbx, iqs); #endif } From 7672adeec7a79ea271058c63106c142ba84f951a Mon Sep 17 00:00:00 2001 From: Galunid Date: Wed, 5 Jun 2024 19:07:24 +0200 Subject: [PATCH 13/39] Fix encoding in python scripts (#7733) --- convert-hf-to-gguf-update.py | 1 + convert-hf-to-gguf.py | 1 + 2 files changed, 2 insertions(+) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 84b72348d..6dae1a594 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- # This script downloads the tokenizer models of the specified models from Huggingface and # generates the get_vocab_base_pre() function for convert-hf-to-gguf.py diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ad071b974..afb9704c8 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- from __future__ import annotations From d67caea0d6e6c303d31b01d0a010973e6c908dff Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 07:17:21 +0200 Subject: [PATCH 14/39] docker : add openmp lib (#7780) --- .devops/full-cuda.Dockerfile | 2 +- .devops/full.Dockerfile | 2 +- .devops/main-cuda.Dockerfile | 3 +++ .devops/main-vulkan.Dockerfile | 2 +- .devops/main.Dockerfile | 3 +++ .devops/server-cuda.Dockerfile | 2 +- .devops/server.Dockerfile | 2 +- 7 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index c01006efe..f6073f662 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 COPY requirements.txt requirements.txt COPY requirements requirements diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index 6d5943a2f..6f19afa9c 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 COPY requirements.txt requirements.txt COPY requirements requirements diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index 23f428944..5bcd45fe8 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -27,6 +27,9 @@ RUN make -j$(nproc) FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +RUN apt-get update && \ + apt-get install -y libgomp1 + COPY --from=build /app/main /main ENTRYPOINT [ "/main" ] diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile index 6c2b2ed5b..1bdb52803 100644 --- a/.devops/main-vulkan.Dockerfile +++ b/.devops/main-vulkan.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy FROM ubuntu:$UBUNTU_VERSION as build # Install build tools -RUN apt update && apt install -y git build-essential cmake wget +RUN apt update && apt install -y git build-essential cmake wget libgomp1 # Install 
Vulkan SDK RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile index 763d75fce..98a58a4b9 100644 --- a/.devops/main.Dockerfile +++ b/.devops/main.Dockerfile @@ -13,6 +13,9 @@ RUN make -j$(nproc) FROM ubuntu:$UBUNTU_VERSION as runtime +RUN apt-get update && \ + apt-get install -y libgomp1 + COPY --from=build /app/main /main ENV LC_ALL=C.utf8 diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile index 7f5228185..2532e69e8 100644 --- a/.devops/server-cuda.Dockerfile +++ b/.devops/server-cuda.Dockerfile @@ -30,7 +30,7 @@ RUN make -j$(nproc) FROM ${BASE_CUDA_RUN_CONTAINER} as runtime RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev + apt-get install -y libcurl4-openssl-dev libgomp1 COPY --from=build /app/server /server diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile index 0d09d3627..a41c16b65 100644 --- a/.devops/server.Dockerfile +++ b/.devops/server.Dockerfile @@ -16,7 +16,7 @@ RUN make -j$(nproc) FROM ubuntu:$UBUNTU_VERSION as runtime RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev + apt-get install -y libcurl4-openssl-dev libgomp1 COPY --from=build /app/server /server From 2d08b7fbb483c14bd2b173d4cd51ea3a4f862e8f Mon Sep 17 00:00:00 2001 From: slaren Date: Thu, 6 Jun 2024 07:19:49 +0200 Subject: [PATCH 15/39] docker : build only main and server in their images (#7782) * add openmp lib to dockerfiles * build only main and server in their docker images --- .devops/main-cuda.Dockerfile | 2 +- .devops/main-rocm.Dockerfile | 2 +- .devops/main.Dockerfile | 2 +- .devops/server-cuda.Dockerfile | 2 +- .devops/server.Dockerfile | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index 5bcd45fe8..2aec4a85d 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} # Enable CUDA ENV LLAMA_CUDA=1 -RUN make -j$(nproc) +RUN make -j$(nproc) main FROM ${BASE_CUDA_RUN_CONTAINER} as runtime diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile index 37576d68e..dcaeb3e72 100644 --- a/.devops/main-rocm.Dockerfile +++ b/.devops/main-rocm.Dockerfile @@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ -RUN make -j$(nproc) +RUN make -j$(nproc) main ENTRYPOINT [ "/app/main" ] diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile index 98a58a4b9..d2514c4ba 100644 --- a/.devops/main.Dockerfile +++ b/.devops/main.Dockerfile @@ -9,7 +9,7 @@ WORKDIR /app COPY . . -RUN make -j$(nproc) +RUN make -j$(nproc) main FROM ubuntu:$UBUNTU_VERSION as runtime diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile index 2532e69e8..4e9747b82 100644 --- a/.devops/server-cuda.Dockerfile +++ b/.devops/server-cuda.Dockerfile @@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1 # Enable cURL ENV LLAMA_CURL=1 -RUN make -j$(nproc) +RUN make -j$(nproc) server FROM ${BASE_CUDA_RUN_CONTAINER} as runtime diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile index a41c16b65..bee63b966 100644 --- a/.devops/server.Dockerfile +++ b/.devops/server.Dockerfile @@ -11,7 +11,7 @@ COPY . . 
ENV LLAMA_CURL=1 -RUN make -j$(nproc) +RUN make -j$(nproc) server FROM ubuntu:$UBUNTU_VERSION as runtime From f5d7b268ec4bf8628aa6ccc9f6631d0230dde76f Mon Sep 17 00:00:00 2001 From: Joan Fontanals Date: Thu, 6 Jun 2024 09:22:41 +0200 Subject: [PATCH 16/39] llama : add jina v2 base code (#7596) * feat: add changes to handle jina v2 base code * fix: do not complicate things * fix: fix the usage of the code model * fix: fix comments * fix: fix linting issues * fix: remove ollama patches * style : minor --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf-update.py | 1 + convert-hf-to-gguf.py | 7 ++++++- gguf-py/gguf/constants.py | 1 + gguf-py/gguf/tensor_mapping.py | 3 +++ llama.cpp | 17 +++++++++++++---- 5 files changed, 24 insertions(+), 5 deletions(-) diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py index 6dae1a594..f43b15760 100755 --- a/convert-hf-to-gguf-update.py +++ b/convert-hf-to-gguf-update.py @@ -83,6 +83,7 @@ models = [ {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", }, + {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", }, ] diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index afb9704c8..a86864f04 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -475,6 +475,9 @@ class Model: if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d": # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct res = "smaug-bpe" + if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a": + # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code + res = "jina-v2-code" if res is None: logger.warning("\n") @@ -2452,11 +2455,13 @@ class JinaBertV2Model(BertModel): def get_tensors(self): for name, data in super().get_tensors(): - if 'gated_layers' in name: + if 'gated_layer' in name: d1 = data[:self.intermediate_size, :] name1 = name.replace('gated_layers', 'gated_layers_w') + name1 = name1.replace('up_gated_layer', 'gated_layers_v') d2 = data[self.intermediate_size:, :] name2 = name.replace('gated_layers', 'gated_layers_v') + name2 = name2.replace('up_gated_layer', 'gated_layers_w') yield name1, d1 yield name2, d2 continue diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a3c024c89..8908585cc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -415,6 +415,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.ATTN_NORM_2, MODEL_TENSOR.ATTN_OUT_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_Q_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 83e3c4c33..81b4992a5 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -102,6 +102,7 @@ class TensorNameMap: # Attention norm 2 MODEL_TENSOR.ATTN_NORM_2: ( "transformer.h.{bid}.ln_attn", # falcon40b + "encoder.layer.{bid}.layer_norm_1", # jina-v2-code ), # Attention query-key-value @@ -311,6 +312,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.c_proj", # starcoder2 "encoder.layer.{bid}.mlp.wo", # jina-bert-v2 
"model.layers.{bid}.residual_mlp.w2", # arctic + "encoder.layer.{bid}.mlp.down_layer", # jina-bert-v2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -350,6 +352,7 @@ class TensorNameMap: "encoder.layers.{bid}.norm2", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_3", # Grok "encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2 + "encoder.layer.{bid}.layer_norm_2" # jina-v2-code ), MODEL_TENSOR.SSM_IN: ( diff --git a/llama.cpp b/llama.cpp index 414d390e8..cefb4d1d5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -704,6 +704,7 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" }, { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, @@ -4653,8 +4654,7 @@ static void llm_load_vocab( LLAMA_LOG_WARN("%s: ************************************ \n", __func__); LLAMA_LOG_WARN("%s: \n", __func__); vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; - } else if ( - tokenizer_pre == "default") { + } else if (tokenizer_pre == "default") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || @@ -4681,7 +4681,8 @@ static void llm_load_vocab( tokenizer_pre == "jina-es" || tokenizer_pre == "jina-de" || tokenizer_pre == "jina-v2-es" || - tokenizer_pre == "jina-v2-de") { + tokenizer_pre == "jina-v2-de" || + tokenizer_pre == "jina-v2-code") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "refact") { @@ -5515,7 +5516,7 @@ static bool llm_load_tensors( layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); } else { - layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); } layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); @@ -5556,6 +5557,9 @@ static bool llm_load_tensors( layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); //output_norm layer.attn_out_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); @@ -8519,6 +8523,11 @@ struct llm_build_context { // attention layer norm cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_out_norm, model.layers[il].attn_out_norm_b, LLM_NORM, cb, il); + if (model.layers[il].attn_norm_2 != nullptr) { + cur = ggml_add(ctx0, cur, inpL); // re-add the layer input + cur = llm_build_norm(ctx0, cur, hparams, model.layers[il].attn_norm_2, model.layers[il].attn_norm_2_b, LLM_NORM, cb, il); + } + struct ggml_tensor * ffn_inp = cur; cb(ffn_inp, "ffn_inp", il); From 55b2d0849d3ec9e45e4a4d9e480f5fa7977872a6 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Thu, 6 Jun 2024 10:07:06 +0100 Subject: [PATCH 17/39] grammars: x{min,max} repetition operator (#6640) * grammars: 
x{min,max} repetition operator + tweak +/*/? to avoid duplication of original over alternates * grammars: handle `x{n}` and fix `x{n,n}` * grammars: document new repetition operators * grammars: uniform use of int for min & max * grammars: refactor parser test * grammar: parsing tests w/ natural pretty print of updated expectations * grammars: much prettier print of expectations (+ TEST_GRAMMAR_PARSER_PRINT_ALL=1 to force all) * grammars: improve test pretty print again * grammars: pretty print rules and chars * grammars: fix copy rule skipping * grammars: disallow `a{,}` (not allowed in regexps) * Update common/grammar-parser.cpp Co-authored-by: Clint Herron * grammars: fix copy rule skipping (again) & display of expectations * grammars: more test cases * grammars: update reps parsing to bring ? / * / + closer to before * json: use new GBNF repetitions{m,n} syntax * grammars: update performance gotchas w/ repetition advice * Update examples/json_schema_to_grammar.py Co-authored-by: Clint Herron * Update examples/server/public/json-schema-to-grammar.mjs Co-authored-by: Clint Herron * grammars: comment on rule repetitions * grammars: ensure unambiguous number alternatives * grammar: nit typo switched error msgs * grammar: nit numbering in comment * json: update numeric rule to be unambiguous * Apply suggestions from code review Co-authored-by: Clint Herron * Update examples/server/public/json-schema-to-grammar.mjs Co-authored-by: Clint Herron * json: fix integral-part * grammar: add repetition tests --------- Co-authored-by: Clint Herron --- common/grammar-parser.cpp | 144 ++++- common/json-schema-to-grammar.cpp | 80 +-- examples/json_schema_to_grammar.py | 70 +-- examples/pydantic_models_to_grammar.py | 2 +- .../server/public/json-schema-to-grammar.mjs | 67 +- grammars/README.md | 12 +- tests/test-grammar-integration.cpp | 76 +++ tests/test-grammar-parser.cpp | 591 +++++++++++++----- tests/test-json-schema-to-grammar.cpp | 112 ++-- 9 files changed, 736 insertions(+), 418 deletions(-) diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index b5bc7d49b..79d2b0354 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -46,8 +46,12 @@ namespace grammar_parser { state.rules[rule_id] = rule; } + static bool is_digit_char(char c) { + return '0' <= c && c <= '9'; + } + static bool is_word_char(char c) { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c); } static std::pair parse_hex(const char * src, int size) { @@ -99,6 +103,17 @@ namespace grammar_parser { return pos; } + static const char * parse_int(const char * src) { + const char * pos = src; + while (is_digit_char(*pos)) { + pos++; + } + if (pos == src) { + throw std::runtime_error(std::string("expecting integer at ") + src); + } + return pos; + } + static std::pair parse_char(const char * src) { if (*src == '\\') { switch (src[1]) { @@ -137,6 +152,60 @@ namespace grammar_parser { bool is_nested) { size_t last_sym_start = out_elements.size(); const char * pos = src; + + auto handle_repetitions = [&](int min_times, int max_times) { + + if (last_sym_start == out_elements.size()) { + throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); + } + + // apply transformation to previous symbol (last_sym_start to end) according to + // the following rewrite rules: + // S{m,n} --> S S S (m times) S'(n-m) + // S'(x) ::= S S'(x-1) | + // (... 
n-m definitions of these S' rules ...) + // S'(1) ::= S | + // S{m,} --> S S S (m times) S' + // S' ::= S S' | + // S* --> S{0,} + // --> S' ::= S S' | + // S+ --> S{1,} + // --> S S' + // S' ::= S S' | + // S? --> S{0,1} + // --> S' + // S' ::= S | + + std::vector previous_elements(out_elements.begin() + last_sym_start, out_elements.end()); + if (min_times == 0) { + out_elements.resize(last_sym_start); + } else { + // Repeat the previous elements (min_times - 1) times + for (int i = 1; i < min_times; i++) { + out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end()); + } + } + + uint32_t last_rec_rule_id = 0; + auto n_opt = max_times < 0 ? 1 : max_times - min_times; + + std::vector rec_rule(previous_elements); + for (int i = 0; i < n_opt; i++) { + rec_rule.resize(previous_elements.size()); + uint32_t rec_rule_id = generate_symbol_id(state, rule_name); + if (i > 0 || max_times < 0) { + rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); + } + rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); + rec_rule.push_back({LLAMA_GRETYPE_END, 0}); + add_rule(state, rec_rule_id, rec_rule); + last_rec_rule_id = rec_rule_id; + } + if (n_opt > 0) { + out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); + } + }; + while (*pos) { if (*pos == '"') { // literal string pos++; @@ -197,40 +266,47 @@ namespace grammar_parser { throw std::runtime_error(std::string("expecting ')' at ") + pos); } pos = parse_space(pos + 1, is_nested); - } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator - if (last_sym_start == out_elements.size()) { - throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos); - } - - // apply transformation to previous symbol (last_sym_start to end) according to - // rewrite rules: - // S* --> S' ::= S S' | - // S+ --> S' ::= S S' | S - // S? 
--> S' ::= S | - uint32_t sub_rule_id = generate_symbol_id(state, rule_name); - std::vector sub_rule; - // add preceding symbol to generated rule - sub_rule.insert( - sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end()); - if (*pos == '*' || *pos == '+') { - // cause generated rule to recurse - sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); - } - // mark start of alternate def - sub_rule.push_back({LLAMA_GRETYPE_ALT, 0}); - if (*pos == '+') { - // add preceding symbol as alternate only for '+' (otherwise empty) - sub_rule.insert( - sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end()); - } - sub_rule.push_back({LLAMA_GRETYPE_END, 0}); - add_rule(state, sub_rule_id, sub_rule); - - // in original rule, replace previous symbol with reference to generated rule - out_elements.resize(last_sym_start); - out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); - + } else if (*pos == '*') { pos = parse_space(pos + 1, is_nested); + handle_repetitions(0, -1); + } else if (*pos == '+') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(1, -1); + } else if (*pos == '?') { + pos = parse_space(pos + 1, is_nested); + handle_repetitions(0, 1); + } else if (*pos == '{') { + pos = parse_space(pos + 1, is_nested); + + if (!is_digit_char(*pos)) { + throw std::runtime_error(std::string("expecting an int at ") + pos); + } + const char * int_end = parse_int(pos); + int min_times = std::stoul(std::string(pos, int_end - pos)); + pos = parse_space(int_end, is_nested); + + int max_times = -1; + + if (*pos == '}') { + max_times = min_times; + pos = parse_space(pos + 1, is_nested); + } else if (*pos == ',') { + pos = parse_space(pos + 1, is_nested); + + if (is_digit_char(*pos)) { + const char * int_end = parse_int(pos); + max_times = std::stoul(std::string(pos, int_end - pos)); + pos = parse_space(int_end, is_nested); + } + + if (*pos != '}') { + throw std::runtime_error(std::string("expecting '}' at ") + pos); + } + pos = parse_space(pos + 1, is_nested); + } else { + throw std::runtime_error(std::string("expecting ',' at ") + pos); + } + handle_repetitions(min_times, max_times); } else { break; } diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 9a71f5d8d..737bae27c 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -16,58 +16,27 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa static std::string repeat(const std::string & str, size_t n); -static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) { +static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { + auto has_max = max_items != std::numeric_limits::max(); + + if (min_items == 0 && max_items == 1) { + return item_rule + "?"; + } + if (separator_rule.empty()) { - if (min_items == 0 && max_items == 1) { - return item_rule + "?"; - } else if (min_items == 1 && max_items == std::numeric_limits::max()) { + if (min_items == 1 && !has_max) { return item_rule + "+"; - } - } - - std::string result; - if (min_items > 0) { - if (item_rule_is_literal && separator_rule.empty()) { - result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\""; + } else if (min_items == 0 && !has_max) { + return item_rule + "*"; } else { - std::vector items(min_items, item_rule); 
- result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " "); + return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}"; } } - std::function opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string { - auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule; - - if (up_to_n == 0) { - return ""; - } else if (up_to_n == 1) { - return "(" + content + ")?"; - } else if (!separator_rule.empty() && !prefix_with_sep) { - return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?"; - } else { - std::string res = repeat("(" + content + " ", up_to_n); - // strip trailing space - res = res.substr(0, res.length() - 1); - res += repeat(")?", up_to_n); - return res; - } - }; - - if (min_items > 0 && max_items != min_items) { - result += " "; + auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items); + if (min_items == 0) { + result = "(" + result + ")?"; } - - if (max_items != std::numeric_limits::max()) { - result += opt_repetitions(max_items - min_items, min_items > 0); - } else { - std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")"; - if (min_items == 0 && !separator_rule.empty()) { - result = "(" + item_rule + " " + item_operator + "*)?"; - } else { - result += item_operator + "*"; - } - } - return result; } @@ -78,30 +47,24 @@ struct BuiltinRule { std::vector deps; }; -const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15); - std::unordered_map PRIMITIVE_RULES = { {"boolean", {"(\"true\" | \"false\") space", {}}}, - {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}}, - {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}}, + {"decimal-part", {"[0-9]{1,16}", {}}}, + {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}}, {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}}, {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}}, {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, {"array", {"\"[\" space ( value (\",\" space value)* )? 
\"]\" space", {"value"}}}, - {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}}, - {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}}, + {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}}, + {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}}, {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, {"null", {"\"null\" space", {}}}, }; std::unordered_map STRING_FORMAT_RULES = { - {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, - {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, + {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, + {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, {"date-time", {"date \"T\" time", {"date", "time"}}}, {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}}, @@ -385,8 +348,7 @@ private: sub_is_literal ? "\"" + sub + "\"" : sub, min_times, max_times, - "", - sub_is_literal + "" ); seq.back().second = false; } else { diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 826cd3f72..7d889c3fe 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -6,52 +6,22 @@ import re import sys from typing import Any, Dict, List, Set, Tuple, Union -def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False): + +def _build_repetition(item_rule, min_items, max_items, separator_rule=None): + + if min_items == 0 and max_items == 1: + return f'{item_rule}?' + if not separator_rule: - if min_items == 0 and max_items == 1: - return f'{item_rule}?' - elif min_items == 1 and max_items is None: + if min_items == 1 and max_items is None: return f'{item_rule}+' - - result = '' - - if min_items > 0: - if item_rule_is_literal and separator_rule is None: - result = '"' + (item_rule[1:-1] * min_items) + '"' + elif min_items == 0 and max_items is None: + return f'{item_rule}*' else: - result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items) + return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}' - def opt_repetitions(up_to_n, prefix_with_sep=False): - ''' - - n=4, no sep: '(a (a (a (a)?)?)?)?' - - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?' - - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?' - ''' - - content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule - if up_to_n == 0: - return '' - elif up_to_n == 1: - return f'({content})?' 
- elif separator_rule and not prefix_with_sep: - return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?' - else: - return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n) - - if min_items > 0 and max_items != min_items: - result += ' ' - - if max_items is not None: - result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) - else: - item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' - - if min_items == 0 and separator_rule: - result = f'({item_rule} {item_operator}*)?' - else: - result += f'{item_operator}*' - - return result + result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None) + return f'({result})?' if min_items == 0 else result class BuiltinRule: @@ -59,31 +29,29 @@ class BuiltinRule: self.content = content self.deps = deps or [] -_up_to_15_digits = _build_repetition('[0-9]', 0, 15) - # whitespace is constrained to a single space char to prevent model "running away" in # whitespace. Also maybe improves generation quality? SPACE_RULE = '" "?' PRIMITIVE_RULES = { 'boolean' : BuiltinRule('("true" | "false") space', []), - 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []), - 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []), + 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), + 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []), - 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []), 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), 'null' : BuiltinRule('"null" space', []), } # TODO: support "uri", "email" string formats STRING_FORMAT_RULES = { - 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), - 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? 
( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), @@ -333,7 +301,7 @@ class SchemaConverter: sub_rule_ids[sub] = id sub = id - seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False) + seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False) else: literal = '' while i < length: diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py index 9acc7cc6d..f029c73a2 100644 --- a/examples/pydantic_models_to_grammar.py +++ b/examples/pydantic_models_to_grammar.py @@ -624,7 +624,7 @@ string ::= "\"" ( "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) )* "\"" ws ws ::= ([ \t\n] ws)? -float ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws +float ::= ("-"? ([0] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws integer ::= [0-9]+""" diff --git a/examples/server/public/json-schema-to-grammar.mjs b/examples/server/public/json-schema-to-grammar.mjs index 8e0be1b40..cef11eab8 100644 --- a/examples/server/public/json-schema-to-grammar.mjs +++ b/examples/server/public/json-schema-to-grammar.mjs @@ -2,57 +2,26 @@ const SPACE_RULE = '" "?'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { + if (minItems === 0 && maxItems === 1) { + return `${itemRule}?`; + } + + const separatorRule = opts.separatorRule ?? ''; const itemRuleIsLiteral = opts.itemRuleIsLiteral ?? false if (separatorRule === '') { - if (minItems === 0 && maxItems === 1) { - return `${itemRule}?`; - } else if (minItems === 1 && maxItems === undefined) { + if (minItems === 1 && maxItems === undefined) { return `${itemRule}+`; - } - } - - let result = ''; - if (minItems > 0) { - if (itemRuleIsLiteral && separatorRule === '') { - result = `"${itemRule.slice(1, -1).repeat(minItems)}"`; + } else if (minItems === 0 && maxItems === undefined) { + return `${itemRule}*`; } else { - result = Array.from({ length: minItems }, () => itemRule) - .join(separatorRule !== '' ? ` ${separatorRule} ` : ' '); + return `${itemRule}{${minItems},${maxItems !== undefined ? maxItems : ''}}`; } } - const optRepetitions = (upToN, prefixWithSep=false) => { - const content = separatorRule !== '' && prefixWithSep ? `${separatorRule} ${itemRule}` : itemRule; - if (upToN === 0) { - return ''; - } else if (upToN === 1) { - return `(${content})?`; - } else if (separatorRule !== '' && !prefixWithSep) { - return `(${content} ${optRepetitions(upToN - 1, true)})?`; - } else { - return Array.from({ length: upToN }, () => `(${content}`).join(' ').trim() + Array.from({ length: upToN }, () => ')?').join(''); - } - }; - - if (minItems > 0 && maxItems !== minItems) { - result += ' '; - } - - if (maxItems !== undefined) { - result += optRepetitions(maxItems - minItems, minItems > 0); - } else { - const itemOperator = `(${separatorRule !== '' ? separatorRule + ' ' : ''}${itemRule})`; - - if (minItems === 0 && separatorRule !== '') { - result = `(${itemRule} ${itemOperator}*)?`; - } else { - result += `${itemOperator}*`; - } - } - - return result; + const result = itemRule + ' ' + _buildRepetition(`(${separatorRule} ${itemRule})`, minItems > 0 ? minItems - 1 : 0, maxItems !== undefined ? maxItems - 1 : undefined); + return minItems === 0 ? 
`(${result})?` : result; } class BuiltinRule { @@ -62,27 +31,25 @@ class BuiltinRule { } } -const UP_TO_15_DIGITS = _buildRepetition('[0-9]', 0, 15); - const PRIMITIVE_RULES = { boolean : new BuiltinRule('("true" | "false") space', []), - 'decimal-part' : new BuiltinRule('[0-9] ' + UP_TO_15_DIGITS, []), - 'integral-part': new BuiltinRule('[0-9] | [1-9] ' + UP_TO_15_DIGITS, []), + 'decimal-part' : new BuiltinRule('[0-9]{1,16}', []), + 'integral-part': new BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), number : new BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), integer : new BuiltinRule('("-"? integral-part) space', ['integral-part']), value : new BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - uuid : new BuiltinRule('"\\"" ' + [8, 4, 4, 4, 12].map(n => [...new Array(n)].map(_ => '[0-9a-fA-F]').join('')).join(' "-" ') + ' "\\"" space', []), - char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])`, []), + uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []), + char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []), string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']), null : new BuiltinRule('"null" space', []), }; // TODO: support "uri", "email" string formats const STRING_FORMAT_RULES = { - 'date' : new BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), - 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date' : new BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : new BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : new BuiltinRule('date "T" time', ['date', 'time']), 'date-string' : new BuiltinRule('"\\"" date "\\"" space', ['date']), 'time-string' : new BuiltinRule('"\\"" time "\\"" space', ['time']), diff --git a/grammars/README.md b/grammars/README.md index 2b8384d9d..3ffc7cec0 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -59,9 +59,13 @@ Parentheses `()` can be used to group sequences, which allows for embedding alte ## Repetition and Optional Symbols -- `*` after a symbol or sequence means that it can be repeated zero or more times. -- `+` denotes that the symbol or sequence should appear one or more times. -- `?` makes the preceding symbol or sequence optional. +- `*` after a symbol or sequence means that it can be repeated zero or more times (equivalent to `{0,}`). +- `+` denotes that the symbol or sequence should appear one or more times (equivalent to `{1,}`). +- `?` makes the preceding symbol or sequence optional (equivalent to `{0,1}`). 
+- `{m}` repeats the precedent symbol or sequence exactly `m` times +- `{m,}` repeats the precedent symbol or sequence at least `m` times +- `{m,n}` repeats the precedent symbol or sequence at between `m` and `n` times (included) +- `{0,n}` repeats the precedent symbol or sequence at most `n` times (included) ## Comments and newlines @@ -98,4 +102,4 @@ Grammars currently have performance gotchas (see https://github.com/ggerganov/ll A common pattern is to allow repetitions of a pattern `x` up to N times. -While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (w/ N-deep nesting) +While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) may result in extremely slow sampling. Instead, you can write `x{0,N}` (or `(x (x (x ... (x)?...)?)?)?` w/ N-deep nesting in earlier llama.cpp versions). diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 01c5bb27a..9bdab05af 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -292,6 +292,82 @@ static void test_quantifiers() { "catyyy", } ); + test_grammar( + "simple exact repetition", + // Grammar + R"""( + root ::= [ab]{4} + )""", + // Passing strings + { + "aaaa", + "bbbb", + "abab", + }, + // Failing strings + { + "a", + "b", + "aaaaa", + } + ); + test_grammar( + "simple min repetition", + // Grammar + R"""( + root ::= [ab]{4,} + )""", + // Passing strings + { + "aaaa", + "aaaaab", + "bbbb", + "ababab", + }, + // Failing strings + { + "", + "aba", + } + ); + test_grammar( + "simple max repetition", + // Grammar + R"""( + root ::= [ab]{0,4} + )""", + // Passing strings + { + "", + "a", + "aa", + "aaa", + "aaab", + }, + // Failing strings + { + "aaaaa", + } + ); + test_grammar( + "min / max repetition", + // Grammar + R"""( + root ::= ("0x" [A-F0-9]{2} " "?){3,5} + )""", + // Passing strings + { + "0xFF 0x12 0xAB", + "0xFF 0x12 0xAB 0x00 0x00", + }, + // Failing strings + { + "", + "0xFF", + "0xFF 0x12", + "0xFF 0x12 0xAB 0x00 0x00 0x00", + } + ); } static void test_failure_missing_root() { diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 91939e276..5df5abb25 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -7,28 +7,79 @@ #include -int main() -{ - grammar_parser::parse_state parsed_grammar; +static const char * type_str(llama_gretype type) { + switch (type) { + case LLAMA_GRETYPE_CHAR: return "LLAMA_GRETYPE_CHAR"; + case LLAMA_GRETYPE_CHAR_NOT: return "LLAMA_GRETYPE_CHAR_NOT"; + case LLAMA_GRETYPE_CHAR_ALT: return "LLAMA_GRETYPE_CHAR_ALT"; + case LLAMA_GRETYPE_CHAR_RNG_UPPER: return "LLAMA_GRETYPE_CHAR_RNG_UPPER"; + case LLAMA_GRETYPE_RULE_REF: return "LLAMA_GRETYPE_RULE_REF"; + case LLAMA_GRETYPE_ALT: return "LLAMA_GRETYPE_ALT"; + case LLAMA_GRETYPE_END: return "LLAMA_GRETYPE_END"; + default: return "?"; + } +} - const char *grammar_bytes = R"""(root ::= (expr "=" term "\n")+ -expr ::= term ([-+*/] term)* -term ::= [0-9]+)"""; +static void verify_parsing(const char *grammar_bytes, const std::vector> expected, const std::vector &expected_rules) { + uint32_t index = 0; + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_bytes); - parsed_grammar = grammar_parser::parse(grammar_bytes); + std::map symbol_names; + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { + symbol_names[it->second] = it->first; + } - 
std::vector> expected = { - {"expr", 2}, - {"expr_5", 5}, - {"expr_6", 6}, - {"root", 0}, - {"root_1", 1}, - {"root_4", 4}, - {"term", 3}, - {"term_7", 7}, + auto print_all = [&]() { + fprintf(stderr, " verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes); + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { + fprintf(stderr, " {\"%s\", %u},\n", it->first.c_str(), it->second); + } + fprintf(stderr, " }, {\n"); + for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) { + fprintf(stderr, " // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule); + auto & rule = parsed_grammar.rules[i_rule]; + for (uint32_t i = 0; i < rule.size(); i++) { + std::string rule_str; + fprintf(stderr, " {%s, ", type_str(rule[i].type)); + if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT || + rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) { + char c = rule[i].value; + if (c == '\n') { + fprintf(stderr, "'\\n'"); + } else if (c == '\t') { + fprintf(stderr, "'\\t'"); + } else if (c == '\r') { + fprintf(stderr, "'\\r'"); + } else if (c == '\0') { + fprintf(stderr, "'\\0'"); + } else { + fprintf(stderr, "'%c'", c); + } + } else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) { + fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value); + } else { + fprintf(stderr, "%u", rule[i].value); + } + fprintf(stderr, "},\n"); + } + } + fprintf(stderr, " });\n"); }; - uint32_t index = 0; + if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) { + print_all(); + fprintf(stderr, "\n"); + return; + } + + fprintf(stderr, "Testing grammar:%s\n", grammar_bytes); + + if (parsed_grammar.symbol_ids.size() != expected.size()) { + fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); + print_all(); + assert(parsed_grammar.symbol_ids.size() == expected.size()); + } + for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) { std::string key = it->first; @@ -38,51 +89,18 @@ term ::= [0-9]+)"""; // pretty print error message before asserting if (expected_pair.first != key || expected_pair.second != value) { + fprintf(stderr, "index: %u\n", index); fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second); fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value); fprintf(stderr, "expected_pair != actual_pair\n"); + fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); + print_all(); } assert(expected_pair.first == key && expected_pair.second == value); index++; } - std::vector expected_rules = { - {LLAMA_GRETYPE_RULE_REF, 4}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 2}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_CHAR, 10}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_RULE_REF, 6}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 1}, - {LLAMA_GRETYPE_RULE_REF, 4}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_RULE_REF, 1}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 45}, - {LLAMA_GRETYPE_CHAR_ALT, 43}, - {LLAMA_GRETYPE_CHAR_ALT, 42}, - {LLAMA_GRETYPE_CHAR_ALT, 47}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_RULE_REF, 6}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 
57}, - {LLAMA_GRETYPE_RULE_REF, 7}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, - {LLAMA_GRETYPE_END, 0}, - }; index = 0; for (auto rule : parsed_grammar.rules) @@ -97,28 +115,306 @@ term ::= [0-9]+)"""; if (expected_element.type != element.type || expected_element.value != element.value) { fprintf(stderr, "index: %u\n", index); - fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value); - fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value); + fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value); + fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value); fprintf(stderr, "expected_element != actual_element\n"); + fprintf(stderr, "all elements:\n"); + fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n"); + print_all(); } assert(expected_element.type == element.type && expected_element.value == element.value); index++; } } +} - const char *longer_grammar_bytes = R"""( - root ::= (expr "=" ws term "\n")+ - expr ::= term ([-+*/] term)* - term ::= ident | num | "(" ws expr ")" ws - ident ::= [a-z] [a-z0-9_]* ws - num ::= [0-9]+ ws - ws ::= [ \t\n]* - )"""; +static void verify_failure(const char *grammar_bytes) { + fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes); + auto result = grammar_parser::parse(grammar_bytes); + assert(result.rules.empty() && "should have failed"); +} - parsed_grammar = grammar_parser::parse(longer_grammar_bytes); +int main() +{ + verify_failure(R"""( + root ::= "a"{,}" + )"""); - expected = { + verify_failure(R"""( + root ::= "a"{,10}" + )"""); + + verify_parsing(R"""( + root ::= "a" + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a" | [bdx-z] | [^1-3] + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR, 'b'}, + {LLAMA_GRETYPE_CHAR_ALT, 'd'}, + {LLAMA_GRETYPE_CHAR_ALT, 'x'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_CHAR_NOT, '1'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= a+ + a ::= "a" + )""", { + {"a", 1}, + {"root", 0}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // a (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"+ + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= a? 
+ a ::= "a" + )""", { + {"a", 1}, + {"root", 0}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // a (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"? + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= a* + a ::= "a" + )""", { + {"a", 1}, + {"root", 0}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // a (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_RULE_REF, /* a */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"* + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{2} + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{2,} + )""", { + {"root", 0}, + {"root_1", 1}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{ 4} + )""", { + {"root", 0}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= "a"{2,4} + )""", { + {"root", 0}, + {"root_1", 1}, + {"root_2", 2}, + }, { + // root (index 0) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // root_2 (index 2) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= (expr "=" term "\n")+ + expr ::= term ([-+*/] term)* + term ::= [0-9]+ + )""", { + {"expr", 2}, + {"expr_5", 5}, + {"expr_6", 6}, + {"root", 0}, + {"root_1", 1}, + {"root_4", 4}, + {"term", 3}, + {"term_7", 7}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4}, + {LLAMA_GRETYPE_END, 0}, + // root_1 (index 1) + {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, + {LLAMA_GRETYPE_CHAR, '='}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, + {LLAMA_GRETYPE_CHAR, '\n'}, + {LLAMA_GRETYPE_END, 0}, + // expr (index 2) + {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, + {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, + {LLAMA_GRETYPE_END, 0}, + // term (index 3) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + 
{LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7}, + {LLAMA_GRETYPE_END, 0}, + // root_4 (index 4) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // expr_5 (index 5) + {LLAMA_GRETYPE_CHAR, '-'}, + {LLAMA_GRETYPE_CHAR_ALT, '+'}, + {LLAMA_GRETYPE_CHAR_ALT, '*'}, + {LLAMA_GRETYPE_CHAR_ALT, '/'}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 3}, + {LLAMA_GRETYPE_END, 0}, + // expr_6 (index 6) + {LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5}, + {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // term_7 (index 7) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); + + verify_parsing(R"""( + root ::= (expr "=" ws term "\n")+ + expr ::= term ([-+*/] term)* + term ::= ident | num | "(" ws expr ")" ws + ident ::= [a-z] [a-z0-9_]* ws + num ::= [0-9]+ ws + ws ::= [ \t\n]* + )""", { {"expr", 2}, {"expr_6", 6}, {"expr_7", 7}, @@ -132,119 +428,88 @@ term ::= [0-9]+)"""; {"term", 4}, {"ws", 3}, {"ws_12", 12}, - }; - - index = 0; - for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) - { - std::string key = it->first; - uint32_t value = it->second; - std::pair expected_pair = expected[index]; - - // pretty print error message before asserting - if (expected_pair.first != key || expected_pair.second != value) - { - fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second); - fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value); - fprintf(stderr, "expected_pair != actual_pair\n"); - } - - assert(expected_pair.first == key && expected_pair.second == value); - - index++; - } - expected_rules = { - {LLAMA_GRETYPE_RULE_REF, 5}, + }, { + // root (index 0) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 2}, - {LLAMA_GRETYPE_CHAR, 61}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_RULE_REF, 4}, - {LLAMA_GRETYPE_CHAR, 10}, + // root_1 (index 1) + {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, + {LLAMA_GRETYPE_CHAR, '='}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, + {LLAMA_GRETYPE_CHAR, '\n'}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 4}, - {LLAMA_GRETYPE_RULE_REF, 7}, + // expr (index 2) + {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, + {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 12}, + // ws (index 3) + {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 8}, + // term (index 4) + {LLAMA_GRETYPE_RULE_REF, /* ident */ 8}, {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_RULE_REF, 9}, + {LLAMA_GRETYPE_RULE_REF, /* num */ 9}, {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, 40}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_RULE_REF, 2}, - {LLAMA_GRETYPE_CHAR, 41}, - {LLAMA_GRETYPE_RULE_REF, 3}, + {LLAMA_GRETYPE_CHAR, '('}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_RULE_REF, /* expr */ 2}, + {LLAMA_GRETYPE_CHAR, ')'}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 1}, - {LLAMA_GRETYPE_RULE_REF, 5}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_RULE_REF, 1}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 45}, - {LLAMA_GRETYPE_CHAR_ALT, 43}, - {LLAMA_GRETYPE_CHAR_ALT, 42}, - {LLAMA_GRETYPE_CHAR_ALT, 47}, - {LLAMA_GRETYPE_RULE_REF, 4}, - 
{LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 6}, - {LLAMA_GRETYPE_RULE_REF, 7}, + // root_5 (index 5) + {LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1}, + {LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 97}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, - {LLAMA_GRETYPE_RULE_REF, 10}, - {LLAMA_GRETYPE_RULE_REF, 3}, + // expr_6 (index 6) + {LLAMA_GRETYPE_CHAR, '-'}, + {LLAMA_GRETYPE_CHAR_ALT, '+'}, + {LLAMA_GRETYPE_CHAR_ALT, '*'}, + {LLAMA_GRETYPE_CHAR_ALT, '/'}, + {LLAMA_GRETYPE_RULE_REF, /* term */ 4}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_RULE_REF, 11}, - {LLAMA_GRETYPE_RULE_REF, 3}, - {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 97}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122}, - {LLAMA_GRETYPE_CHAR_ALT, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, - {LLAMA_GRETYPE_CHAR_ALT, 95}, - {LLAMA_GRETYPE_RULE_REF, 10}, + // expr_7 (index 7) + {LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6}, + {LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, - {LLAMA_GRETYPE_RULE_REF, 11}, - {LLAMA_GRETYPE_ALT, 0}, - {LLAMA_GRETYPE_CHAR, 48}, - {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57}, + // ident (index 8) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, + {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, {LLAMA_GRETYPE_END, 0}, - {LLAMA_GRETYPE_CHAR, 32}, - {LLAMA_GRETYPE_CHAR_ALT, 9}, - {LLAMA_GRETYPE_CHAR_ALT, 10}, - {LLAMA_GRETYPE_RULE_REF, 12}, + // num (index 9) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11}, + {LLAMA_GRETYPE_RULE_REF, /* ws */ 3}, + {LLAMA_GRETYPE_END, 0}, + // ident_10 (index 10) + {LLAMA_GRETYPE_CHAR, 'a'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'}, + {LLAMA_GRETYPE_CHAR_ALT, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_CHAR_ALT, '_'}, + {LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}, - }; - - index = 0; - for (auto rule : parsed_grammar.rules) - { - // compare rule to expected rule - for (uint32_t i = 0; i < rule.size(); i++) - { - llama_grammar_element element = rule[i]; - llama_grammar_element expected_element = expected_rules[index]; - - // pretty print error message before asserting - if (expected_element.type != element.type || expected_element.value != element.value) - { - fprintf(stderr, "index: %u\n", index); - fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value); - fprintf(stderr, "actual_element: %d, %u\n", element.type, element.value); - fprintf(stderr, "expected_element != actual_element\n"); - } - - assert(expected_element.type == element.type && expected_element.value == element.value); - index++; - } - } + // num_11 (index 11) + {LLAMA_GRETYPE_CHAR, '0'}, + {LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'}, + {LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + // ws_12 (index 12) + {LLAMA_GRETYPE_CHAR, ' '}, + {LLAMA_GRETYPE_CHAR_ALT, '\t'}, + {LLAMA_GRETYPE_CHAR_ALT, '\n'}, + {LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12}, + {LLAMA_GRETYPE_ALT, 0}, + {LLAMA_GRETYPE_END, 0}, + }); return 0; } diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index c5361b5b8..052c08073 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -105,9 +105,9 @@ static void test_all(const std::string & lang, std::function Date: 
Thu, 6 Jun 2024 09:17:54 -0300 Subject: [PATCH 18/39] README minor fixes (#7798) [no ci] derievatives --> derivatives --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d2a59d89..09e8cad31 100644 --- a/README.md +++ b/README.md @@ -598,7 +598,7 @@ Building the program with BLAS support may lead to some performance improvements To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. -Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives. +Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives. It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. ```bash From ad675e1c67a05b16e4e12abe30dbecfc808e7b7e Mon Sep 17 00:00:00 2001 From: Clint Herron Date: Thu, 6 Jun 2024 06:08:52 -0700 Subject: [PATCH 19/39] Added support for . (any character) token in grammar engine. (#6467) * Added support for . (any characer) token in grammar engine. * Add integration tests for any-character symbol. --- common/grammar-parser.cpp | 11 +++++++++++ llama.cpp | 12 ++++++++++-- llama.h | 3 +++ tests/test-grammar-integration.cpp | 28 ++++++++++++++++++++++++++++ 4 files changed, 52 insertions(+), 2 deletions(-) diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 79d2b0354..a518b766d 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -266,6 +266,10 @@ namespace grammar_parser { throw std::runtime_error(std::string("expecting ')' at ") + pos); } pos = parse_space(pos + 1, is_nested); + } else if (*pos == '.') { // any char + last_sym_start = out_elements.size(); + out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); + pos = parse_space(pos + 1, is_nested); } else if (*pos == '*') { pos = parse_space(pos + 1, is_nested); handle_repetitions(0, -1); @@ -401,6 +405,7 @@ namespace grammar_parser { case LLAMA_GRETYPE_CHAR_NOT: return true; case LLAMA_GRETYPE_CHAR_ALT: return true; case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; + case LLAMA_GRETYPE_CHAR_ANY: return true; default: return false; } } @@ -415,6 +420,7 @@ namespace grammar_parser { case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break; case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break; case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break; + case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break; } switch (elem.type) { case LLAMA_GRETYPE_END: @@ -426,6 +432,7 @@ namespace grammar_parser { case LLAMA_GRETYPE_CHAR_NOT: case LLAMA_GRETYPE_CHAR_RNG_UPPER: case LLAMA_GRETYPE_CHAR_ALT: + case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "(\""); print_grammar_char(file, elem.value); fprintf(file, "\") "); @@ -483,11 +490,15 @@ namespace grammar_parser { } print_grammar_char(file, elem.value); break; + case LLAMA_GRETYPE_CHAR_ANY: + fprintf(file, "."); + break; } if (is_char_element(elem)) { switch (rule[i + 1].type) { case LLAMA_GRETYPE_CHAR_ALT: case LLAMA_GRETYPE_CHAR_RNG_UPPER: + case LLAMA_GRETYPE_CHAR_ANY: break; default: fprintf(file, "] "); diff --git a/llama.cpp b/llama.cpp index cefb4d1d5..32264a008 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13640,7 +13640,7 @@ static std::pair 
llama_grammar_match_char( const uint32_t chr) { bool found = false; - bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR; + bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY; GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); // NOLINT @@ -13649,6 +13649,10 @@ static std::pair llama_grammar_match_char( // inclusive range, e.g. [a-z] found = found || (pos->value <= chr && chr <= pos[1].value); pos += 2; + } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) { + // Any character matches "." + found = true; + pos += 1; } else { // exact char match, e.g. [a] or "a" found = found || pos->value == chr; @@ -13666,7 +13670,7 @@ static bool llama_grammar_match_partial_char( const llama_grammar_element * pos, const llama_partial_utf8 partial_utf8) { - bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR; + bool is_positive_char = pos->type == LLAMA_GRETYPE_CHAR || pos->type == LLAMA_GRETYPE_CHAR_ANY; GGML_ASSERT(is_positive_char || pos->type == LLAMA_GRETYPE_CHAR_NOT); uint32_t partial_value = partial_utf8.value; @@ -13696,6 +13700,9 @@ static bool llama_grammar_match_partial_char( return is_positive_char; } pos += 2; + } else if (pos->type == LLAMA_GRETYPE_CHAR_ANY) { + // Any character matches "." + return true; } else { // exact char match, e.g. [a] or "a" if (low <= pos->value && pos->value <= high) { @@ -13756,6 +13763,7 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: + case LLAMA_GRETYPE_CHAR_ANY: if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { // only add the stack if it's not a duplicate of one we already have new_stacks.emplace_back(stack); diff --git a/llama.h b/llama.h index 9dcd67bef..62908261f 100644 --- a/llama.h +++ b/llama.h @@ -365,6 +365,9 @@ extern "C" { // modifies a preceding LLAMA_GRETYPE_CHAR or // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) LLAMA_GRETYPE_CHAR_ALT = 6, + + // any character (.) + LLAMA_GRETYPE_CHAR_ANY = 7, }; typedef struct llama_grammar_element { diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 9bdab05af..8787fb1ec 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -205,6 +205,33 @@ static void test_complex_grammar() { ); } +static void test_special_chars() { + // A collection of tests to exercise special characters such as "." + test_grammar( + "special characters", + // Grammar + R"""( + root ::= ... "abc" ... + )""", + // Passing strings + { + "abcabcabc", + "aaaabcccc", + // NOTE: Also ensures that multi-byte characters still count as a single character + "🔵🟠✅abc❌🟠🔵" + }, + // Failing strings + { + "aaabcccc", + "aaaaabcccc", + "aaaabccc", + "aaaabccccc", + "🔵🟠✅❌abc❌✅🟠🔵" + "🔵🟠abc🟠🔵" + } + ); +} + static void test_quantifiers() { // A collection of tests to exercise * + and ? 
quantifiers @@ -445,6 +472,7 @@ int main() { fprintf(stdout, "Running grammar integration tests...\n"); test_simple_grammar(); test_complex_grammar(); + test_special_chars(); test_quantifiers(); test_failure_missing_root(); test_failure_missing_reference(); From f83351f9a62a6262f1fc3d08f320033089cddfb5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jun 2024 16:30:58 +0300 Subject: [PATCH 20/39] imatrix : migrate to gpt_params (#7771) * imatrix : migrate to gpt_params ggml-ci * imatrix : add --save-frequency cli arg * common : fix --no-ppl --- common/common.cpp | 75 ++++++++++- common/common.h | 99 +++++++------- examples/imatrix/README.md | 11 +- examples/imatrix/imatrix.cpp | 241 +++++++++++------------------------ examples/server/server.cpp | 2 +- 5 files changed, 213 insertions(+), 215 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index c8df9a4ce..601bd2164 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); + params = params_org; return false; } @@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa } return true; } + if (arg == "--in-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + params.in_files.push_back(argv[i]); + return true; + } if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { if (++i >= argc) { invalid_param = true; @@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa return true; } if (arg == "-v" || arg == "--verbose") { - params.verbose = true; + params.verbosity = 1; + return true; + } + if (arg == "--verbosity") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.verbosity = std::stoi(argv[i]); return true; } if (arg == "--verbose-prompt") { @@ -1537,6 +1560,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.i_pos = std::stoi(argv[i]); return true; } + if (arg == "-o" || arg == "--output" || arg == "--output-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.out_file = argv[i]; + return true; + } + if (arg == "-ofreq" || arg == "--output-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_out_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--save-frequency") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_save_freq = std::stoi(argv[i]); + return true; + } + if (arg == "--process-output") { + params.process_output = true; + return true; + } + if (arg == "--no-ppl") { + params.compute_ppl = false; + return true; + } + if (arg == "--chunk" || arg == "--from-chunk") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.i_chunk = std::stoi(argv[i]); + return true; + } #ifndef LOG_DISABLE_LOGS // Parse args for logging parameters if (log_param_single_parse(argv[i])) { @@ -1612,6 +1675,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-h, --help, --usage", "print usage and exit" }); options.push_back({ "*", " --version", "show version and build info" }); options.push_back({ "*", "-v, --verbose", "print verbose information" }); + 
options.push_back({ "*", " --verbosity N", "set specific verbosity level (default: %d)", params.verbosity }); options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); @@ -1637,6 +1701,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() }); options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" }); + options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" }); options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" }); options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); options.push_back({ "*", " --no-escape", "do not process escape sequences" }); @@ -1804,6 +1869,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk }); options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos }); + options.push_back({ "imatrix" }); + options.push_back({ "imatrix", "-o, --output FNAME", "output file (default: '%s')", params.out_file.c_str() }); + options.push_back({ "imatrix", " --output-frequency N", "output the imatrix every N iterations (default: %d)", params.n_out_freq }); + options.push_back({ "imatrix", " --save-frequency N", "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); + options.push_back({ "imatrix", " --process-output", "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); + options.push_back({ "imatrix", " --no-ppl", "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); + options.push_back({ "imatrix", " --chunk N", "start processing the input from chunk N (default: %d)", params.i_chunk }); + options.push_back({ "bench" }); options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? 
"true" : "false" }); options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" }); diff --git a/common/common.h b/common/common.h index e0a08a61b..de6238e27 100644 --- a/common/common.h +++ b/common/common.h @@ -56,43 +56,42 @@ struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed int32_t n_threads = cpu_get_num_math(); - int32_t n_threads_draft = -1; - int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) - int32_t n_threads_batch_draft = -1; - int32_t n_predict = -1; // new tokens to predict - int32_t n_ctx = 0; // context size - int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 5; // number of tokens to draft during speculative decoding - int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) - int32_t n_parallel = 1; // number of parallel sequences to decode - int32_t n_sequences = 1; // number of sequences to decode - float p_split = 0.1f; // speculative decoding split probability - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - int32_t n_beams = 0; // if non-zero then use beam search of given width. - int32_t grp_attn_n = 1; // group-attention factor - int32_t grp_attn_w = 512; // group-attention width - int32_t n_print = -1; // print token count every n tokens (-1 = disabled) - float rope_freq_base = 0.0f; // RoPE base frequency - float rope_freq_scale = 0.0f; // RoPE frequency scaling factor + int32_t n_threads_draft = -1; + int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) + int32_t n_threads_batch_draft = -1; + int32_t n_predict = -1; // new tokens to predict + int32_t n_ctx = 0; // context size + int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_draft = 5; // number of tokens to draft during speculative decoding + int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) + int32_t n_parallel = 1; // number of parallel sequences to decode + int32_t n_sequences = 1; // number of sequences to decode + float p_split = 0.1f; // speculative decoding split probability + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + int32_t n_beams = 0; // if non-zero then use beam search of given width. 
+ int32_t grp_attn_n = 1; // group-attention factor + int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) + float rope_freq_base = 0.0f; // RoPE base frequency + float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor - float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor + float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor float yarn_beta_fast = 32.0f; // YaRN low correction dim - float yarn_beta_slow = 1.0f; // YaRN high correction dim - int32_t yarn_orig_ctx = 0; // YaRN original context length + float yarn_beta_slow = 1.0f; // YaRN high correction dim + int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold - std::string rpc_servers = ""; // comma separated list of RPC servers ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings @@ -114,7 +113,9 @@ struct gpt_params { std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding std::string logits_file = ""; // file for saving *all* logits + std::string rpc_servers = ""; // comma separated list of RPC servers + std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector kv_overrides; @@ -124,23 +125,24 @@ struct gpt_params { std::vector control_vectors; // control vector with user defined scale + int32_t verbosity = 0; int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector - int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. - int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line - // (which is more convenient to use for plotting) - // - bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt - size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. + int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line + // (which is more convenient to use for plotting) + // + bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt + size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score - bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt - size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. 
If 0, all tasks will be computed + bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt + size_t winogrande_tasks = 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed - bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt - size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed + bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt + size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed - bool kl_divergence = false; // compute KL divergence + bool kl_divergence = false; // compute KL divergence bool usage = false; // print usage bool use_color = false; // use color to distinguish generations and inputs @@ -163,7 +165,6 @@ struct gpt_params { bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory - bool verbose = false; bool verbose_prompt = false; // print prompt tokens before generation bool display_prompt = true; // print prompt before generation bool infill = false; // use infill mode @@ -180,10 +181,10 @@ struct gpt_params { std::vector image; // path to image file(s) // server params - int32_t port = 8080; - int32_t timeout_read = 600; - int32_t timeout_write = timeout_read; - int32_t n_threads_http = -1; + int32_t port = 8080; // server listens on this network port + int32_t timeout_read = 600; // http read timeout in seconds + int32_t timeout_write = timeout_read; // http write timeout in seconds + int32_t n_threads_http = -1; // number of threads to use for http server (-1 = use n_threads) std::string hostname = "127.0.0.1"; std::string public_path = ""; @@ -219,6 +220,16 @@ struct gpt_params { // passkey params int32_t n_junk = 250; // number of times to repeat the junk text int32_t i_pos = -1; // position of the passkey in the junk text + + // imatrix params + std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file + + int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations + int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations + int32_t i_chunk = 0; // start processing from this chunk + + bool process_output = false; // collect data for the output tensor + bool compute_ppl = true; // whether to compute perplexity }; void gpt_params_handle_model_default(gpt_params & params); diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 458c01b87..866ca9f56 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/ ## Usage ``` -./imatrix -m -f [-o ] [--verbosity ] - [-ofreq num_chunks] [-ow <0 or 1>] [other common params] +./imatrix \ + -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ + [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ + [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] ``` Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory. 
The parameters in square brackets are optional and have the following meaning:
 * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
 * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
-* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
-* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
+* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
+* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
+* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
 
 For faster computation, make sure to use GPU offloading via the `-ngl` argument
 
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index e050c09d2..38420041c 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -17,39 +17,37 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+    gpt_params_print_usage(argc, argv, params);
+
+    LOG_TEE("\nexample usage:\n");
+    LOG_TEE("\n    %s \\\n"
+            "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
+            "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
+            "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
+    LOG_TEE("\n");
+}
+
 struct Stats {
     std::vector<float> values;
     std::vector<int>   counts;
     int ncall = 0;
 };
 
-struct StatParams {
-    std::string dataset;
-    std::string ofile = "imatrix.dat";
-    int         n_output_frequency = 10;
-    int         verbosity = 1;
-    int         keep_every = 0;
-    bool        collect_output_weight = false;
-};
-
 class IMatrixCollector {
 public:
     IMatrixCollector() = default;
-    void set_parameters(StatParams&& params) { m_params = std::move(params); }
+    void set_params(gpt_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-    void save_imatrix() const;
-    bool load_imatrix(const char * file_name, bool add);
-    static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
+    void save_imatrix(int ncall = -1) const;
+    bool load_imatrix(const char * file_name);
 private:
     std::unordered_map<std::string, Stats> m_stats;
-    StatParams                             m_params;
+    gpt_params                             m_params;
     std::mutex                             m_mutex;
     int                                    m_last_call = 0;
     std::vector<float>                     m_src1_data;
     std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
-    //
-    void save_imatrix(const char * file_name, const char * dataset) const;
-    void keep_imatrix(int ncall) const;
 };
 
 // remove any prefix and suffixes from the name
@@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         if (t->op != GGML_OP_MUL_MAT) return false;
         // why are small 
batches ignored (<16 tokens)? if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; + if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; return true; } @@ -158,16 +156,16 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } if (e.ncall > m_last_call) { m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { + if (m_last_call % m_params.n_out_freq == 0) { save_imatrix(); } - if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { - keep_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { + save_imatrix(m_last_call); } } } } else { - auto& e = m_stats[wname]; + auto & e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); e.counts.resize(src1->ne[0], 0); @@ -189,11 +187,11 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } if (e.ncall > m_last_call) { m_last_call = e.ncall; - if (m_last_call % m_params.n_output_frequency == 0) { + if (m_last_call % m_params.n_out_freq == 0) { save_imatrix(); } - if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { - keep_imatrix(m_last_call); + if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { + save_imatrix(m_last_call); } } } @@ -201,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * return true; } -void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); -} +void IMatrixCollector::save_imatrix(int ncall) const { + auto fname = m_params.out_file; + if (fname.empty()) { + fname = "imatrix.dat"; + } -void IMatrixCollector::keep_imatrix(int ncall) const { - auto file_name = m_params.ofile; - if (file_name.empty()) file_name = "imatrix.dat"; - file_name += ".at_"; - file_name += std::to_string(ncall); - save_imatrix(file_name.c_str(), m_params.dataset.c_str()); -} + if (ncall > 0) { + fname += ".at_"; + fname += std::to_string(ncall); + } -void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); out.write((const char *) &n_entries, sizeof(n_entries)); @@ -236,26 +232,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co // Write the number of call the matrix was computed with out.write((const char *) &m_last_call, sizeof(m_last_call)); - // Write the dataset name at the end of the file to later on specify it in quantize - int n_dataset = strlen(dataset); - out.write((const char *) &n_dataset, sizeof(n_dataset)); - out.write(dataset, n_dataset); + // Write the input filename at the end of the file to later on specify it in quantize + { + int len = m_params.prompt_file.size(); + out.write((const char *) &len, sizeof(len)); + out.write(m_params.prompt_file.c_str(), len); + } if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); } } -bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map& imatrix_data) { - std::ifstream in(imatrix_file, std::ios::binary); +bool 
IMatrixCollector::load_imatrix(const char * fname) { + std::ifstream in(fname, std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__,imatrix_file); + printf("%s: failed to open %s\n",__func__, fname); return false; } int n_entries; in.read((char*)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { - printf("%s: no data in file %s\n", __func__, imatrix_file); + printf("%s: no data in file %s\n", __func__, fname); return false; } for (int i = 0; i < n_entries; ++i) { @@ -263,23 +261,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); + printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); return false; } name_as_vec[len] = 0; std::string name{name_as_vec.data()}; - auto& e = imatrix_data[std::move(name)]; + auto & e = m_stats[std::move(name)]; int ncall; in.read((char*)&ncall, sizeof(ncall)); int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { printf("%s: failed reading number of values for entry %d\n",__func__,i); - imatrix_data = {}; + m_stats = {}; return false; } - // When re-called from load_imatrix() with add set, this will already be created. if (e.values.empty()) { e.values.resize(nval, 0); e.counts.resize(nval, 0); @@ -289,7 +286,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { printf("%s: failed reading data for entry %d\n",__func__,i); - imatrix_data = {}; + m_stats = {}; return false; } @@ -304,13 +301,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma return true; } -bool IMatrixCollector::load_imatrix(const char * file_name, bool add) { - if (!add) { - m_stats.clear(); - } - return load_imatrix(file_name, m_stats); -} - static IMatrixCollector g_collector; static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { @@ -324,7 +314,7 @@ struct results_log_softmax { float prob; }; -static std::vector softmax(const std::vector& logits) { +static std::vector softmax(const std::vector & logits) { std::vector probs(logits.size()); float max_logit = logits[0]; for (float v : logits) { @@ -358,8 +348,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to static void process_logits( int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, - double & nll, double & nll2, float * logit_history, float * prob_history -) { + double & nll, double & nll2, float * logit_history, float * prob_history) { std::mutex mutex; int counter = 0; auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { @@ -391,8 +380,7 @@ static void process_logits( } } -static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { - +static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); const int n_ctx = llama_n_ctx(ctx); @@ -405,13 +393,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g 
ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); - if (from_chunk > 0) { - if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { - fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); + if (params.i_chunk > 0) { + if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { + fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); return false; } - fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); - tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); + fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); + tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); } if (int(tokens.size()) < 2*n_ctx) { @@ -424,7 +412,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool std::vector logit_history; std::vector prob_history; - if (compute_ppl) { + if (params.compute_ppl) { logit_history.resize(tokens.size()); prob_history.resize(tokens.size()); } @@ -446,7 +434,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool const int num_batches = (n_ctx + n_batch - 1) / n_batch; std::vector logits; - if (compute_ppl && num_batches > 1) { + if (params.compute_ppl && num_batches > 1) { logits.reserve((size_t)n_ctx * n_vocab); } @@ -482,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool // restore the original token in case it was set to BOS tokens[batch_start] = token_org; - if (compute_ppl && num_batches > 1) { + if (params.compute_ppl && num_batches > 1) { const auto * batch_logits = llama_get_logits(ctx); logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); } @@ -501,7 +489,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); } - if (compute_ppl) { + if (params.compute_ppl) { const int first = n_ctx/2; const auto all_logits = num_batches > 1 ? 
logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, @@ -516,7 +504,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } printf("\n"); - if (compute_ppl) { + if (params.compute_ppl) { nll2 /= count; nll /= count; const double ppl = exp(nll); @@ -533,109 +521,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool } int main(int argc, char ** argv) { - StatParams sparams; - std::string prev_result_file; - std::string combine_files; - bool compute_ppl = true; - int from_chunk = 0; - std::vector args; - args.push_back(argv[0]); - int iarg = 1; - for (; iarg < argc-1; ++iarg) { - std::string arg{argv[iarg]}; - if (arg == "-o" || arg == "--output-file") { - sparams.ofile = argv[++iarg]; - } - else if (arg == "-ofreq" || arg == "--output-frequency") { - sparams.n_output_frequency = std::stoi(argv[++iarg]); - } - else if (arg == "-ow" || arg == "--output-weight") { - sparams.collect_output_weight = std::stoi(argv[++iarg]); - } - else if (arg == "--verbosity") { - sparams.verbosity = std::stoi(argv[++iarg]); - } else if (arg == "--no-ppl") { - compute_ppl = false; - } else if (arg == "--keep-imatrix") { - sparams.keep_every = std::stoi(argv[++iarg]); - } else if (arg == "--continue-from") { - prev_result_file = argv[++iarg]; - } else if (arg == "--combine") { - combine_files = argv[++iarg]; - } - else if (arg == "--from-chunk") { - from_chunk = std::stoi(argv[++iarg]); - } else { - args.push_back(argv[iarg]); - } - } - if (iarg < argc) { - std::string arg{argv[iarg]}; - if (arg == "--no-ppl") { - compute_ppl = false; - } else { - args.push_back(argv[iarg]); - } - } - gpt_params params; - params.n_batch = 512; + + params.n_ctx = 512; + params.logits_all = true; + params.verbosity = 1; if (!gpt_params_parse(argc, argv, params)) { - gpt_params_print_usage(argc, argv, params); + print_usage(argc, argv, params); return 1; } - params.logits_all = true; params.n_batch = std::min(params.n_batch, params.n_ctx); - print_build_info(); + g_collector.set_params(params); - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - - sparams.dataset = params.prompt_file; - g_collector.set_parameters(std::move(sparams)); - - if (!combine_files.empty()) { - std::vector files; - size_t pos = 0; - while (true) { - auto new_pos = combine_files.find(',', pos); - if (new_pos != std::string::npos) { - files.emplace_back(combine_files.substr(pos, new_pos - pos)); - pos = new_pos + 1; - } else { - files.emplace_back(combine_files.substr(pos)); - break; - } - } - if (files.size() < 2) { - fprintf(stderr, "You must provide at least two comma separated files to use --combine\n"); + for (const auto & in_file : params.in_files) { + printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); + if (!g_collector.load_imatrix(in_file.c_str())) { + fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); return 1; } - printf("Combining the following %d files\n", int(files.size())); - for (auto& file : files) { - printf(" %s\n", file.c_str()); - if (!g_collector.load_imatrix(file.c_str(), true)) { - fprintf(stderr, "Failed to load %s\n", file.c_str()); - return 1; - } - } + } + + if (params.in_files.size() > 1) { + printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); g_collector.save_imatrix(); - return 0; - } - 
- if (!prev_result_file.empty()) { - if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) { - fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str()); - return 1; - } } llama_backend_init(); @@ -650,6 +561,7 @@ int main(int argc, char ** argv) { // init llama_model * model; llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr || ctx == nullptr) { fprintf(stderr, "%s : failed to init\n", __func__); @@ -668,8 +580,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); } - bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); - if (!OK) { + if (!compute_imatrix(ctx, params)) { return 1; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d581cad95..74da81dad 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2360,7 +2360,7 @@ int main(int argc, char ** argv) { // TODO: not great to use extern vars server_log_json = params.log_json; - server_verbose = params.verbose; + server_verbose = params.verbosity > 0; // struct that contains llama context and inference server_context ctx_server; From ee459f40f65810a810151b24eba5b8bd174ceffe Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 6 Jun 2024 19:19:59 +0300 Subject: [PATCH 21/39] server : fix --threads-http arg (#7801) --- common/common.cpp | 9 +++++++++ common/common.h | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 601bd2164..cdcb352b5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1414,6 +1414,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.timeout_write = std::stoi(argv[i]); return true; } + if (arg == "--threads-http") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_http = std::stoi(argv[i]); + return true; + } if (arg == "-spf" || arg == "--system-prompt-file") { if (++i >= argc) { invalid_param = true; @@ -1893,6 +1901,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" }); options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" }); options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read }); + options.push_back({ "server", " --threads-http N", "number of threads used to process HTTP requests (default: %d)", params.n_threads_http }); options.push_back({ "server", " --system-prompt-file FNAME", "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" }); options.push_back({ "server", " --log-format {text,json}", diff --git a/common/common.h b/common/common.h index de6238e27..35f5311e1 100644 --- a/common/common.h +++ b/common/common.h @@ -184,7 +184,7 @@ struct gpt_params { int32_t port = 8080; // server listens on this network port int32_t timeout_read = 600; // http read timeout in seconds int32_t timeout_write = timeout_read; // http write timeout in seconds - int32_t n_threads_http = -1; // number of threads to use for http server (-1 = use n_threads) + int32_t n_threads_http = -1; // number of threads to process HTTP requests std::string hostname = "127.0.0.1"; std::string public_path = ""; From c9ee7118d5644dd3df70ea6878b36a9761616aab Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 
Jun 2024 08:01:29 +0200 Subject: [PATCH 22/39] check for nans in imatrix and quantize (#7807) * imatrix : detect nan/inf values * quantize : check imatrix for nan/inf values --- examples/imatrix/imatrix.cpp | 8 ++++++++ llama.cpp | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index 38420041c..e18f49563 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -151,6 +151,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[e_start + j] += x[j]*x[j]; e.counts[e_start + j]++; + if (!std::isfinite(e.values[e_start + j])) { + fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str()); + exit(1); + } } } } @@ -183,6 +187,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; e.counts[j]++; + if (!std::isfinite(e.values[j])) { + fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str()); + exit(1); + } } } if (e.ncall > m_last_call) { diff --git a/llama.cpp b/llama.cpp index 32264a008..8b675ea99 100644 --- a/llama.cpp +++ b/llama.cpp @@ -15237,6 +15237,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (imatrix_data) { LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size())); qs.has_imatrix = true; + // check imatrix for nans or infs + for (const auto & kv : *imatrix_data) { + for (float f : kv.second) { + if (!std::isfinite(f)) { + throw std::runtime_error(format("imatrix contains non-finite value %f\n", f)); + } + } + } } } From d5c938cd7716b9a2ace49a43a469dfbffcff4d28 Mon Sep 17 00:00:00 2001 From: pengxin99 Date: Fri, 7 Jun 2024 14:28:26 +0800 Subject: [PATCH 23/39] [SYCL] fix softmax r2r result wrong issue (#7811) --- ggml-sycl.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 3ff76474d..0a645b2e1 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -9108,6 +9108,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const // find the sum of exps in the block tmp = warp_reduce_sum(tmp, item_ct1); if (block_size > WARP_SIZE) { + item_ct1.barrier(sycl::access::fence_space::local_space); if (warp_id == 0) { buf[lane_id] = 0.f; } From a5cabd76491f07494c5b8267f921c73f5e2bbfb4 Mon Sep 17 00:00:00 2001 From: woodx <124784234+woodx9@users.noreply.github.com> Date: Fri, 7 Jun 2024 15:09:45 +0800 Subject: [PATCH 24/39] server : do not get prompt in infill mode (#7286) * avoid to get prompt in infill mode and embedding mode * remove embedding mode * refactor format --------- Co-authored-by: wudexiang --- examples/server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 74da81dad..528220607 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -888,7 +888,7 @@ struct server_context { slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix); // get prompt - { + if (!task.infill) { const auto & prompt = data.find("prompt"); if (prompt == data.end()) { send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST); From 7027b27d765db95d4ac6b569d976e387a8715881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 7 Jun 2024 11:15:49 +0200 
Subject: [PATCH 25/39] server: update cache_prompt documentation [no ci] (#7745) --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index 0c3db8c84..ccbdcdbdb 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -279,7 +279,7 @@ node index.js `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false` + `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false` `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) From 27615f5ab21060d96953c9c1e223051ab2188f57 Mon Sep 17 00:00:00 2001 From: intelmatt <61025942+intelmatt@users.noreply.github.com> Date: Fri, 7 Jun 2024 05:15:07 -0700 Subject: [PATCH 26/39] cmake : fix BUILD_SHARED_LIBS=ON build (#7784) common depends on pthreads in Linux --- common/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 0ec8d6d8d..171530c91 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -84,4 +84,4 @@ endif () target_include_directories(${TARGET} PUBLIC .) 
target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads) From c00fad71e507ff386d42bd74846fe06d19dd63a4 Mon Sep 17 00:00:00 2001 From: Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com> Date: Fri, 7 Jun 2024 08:56:01 -0400 Subject: [PATCH 27/39] gguf-split : change binary multi-byte units to decimal (#7803) --- examples/gguf-split/gguf-split.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index e04feeae3..881f0451c 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) { int n; if (str.back() == 'M') { sscanf(str.c_str(), "%d", &n); - n_bytes = (size_t)n * 1024 * 1024; // megabytes + n_bytes = (size_t)n * 1000 * 1000; // megabytes } else if (str.back() == 'G') { sscanf(str.c_str(), "%d", &n); - n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes + n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes } else { throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); } @@ -284,7 +284,7 @@ struct split_strategy { struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); total_size += ggml_nbytes(t); } - total_size = total_size / 1024 / 1024; // convert to megabytes + total_size = total_size / 1000 / 1000; // convert to megabytes printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); i_split++; } From da799b41891e34aac86ce4e173f9c4c0afd4fab3 Mon Sep 17 00:00:00 2001 From: slaren Date: Fri, 7 Jun 2024 19:47:49 +0200 Subject: [PATCH 28/39] vulkan : reuse parent extra for views (#7806) * vulkan : reuse parent extra for views * Fix validation error when multiple compute contexts are used in a graph --------- Co-authored-by: 0cc4m --- ggml-vulkan.cpp | 128 +++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 72 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index e0c512c0d..128769177 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -345,15 +345,12 @@ struct vk_context { }; struct ggml_tensor_extra_gpu { - bool ready; - size_t ctx_idx; vk_buffer_ref buffer_gpu; uint64_t offset; void reset() { - ready = false; ctx_idx = 0; buffer_gpu.reset(); offset = 0; @@ -2949,7 +2946,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); vk_buffer d_X; @@ -2958,12 +2955,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset; + qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3114,7 +3111,7 @@ static void 
ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -3122,12 +3119,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context uint64_t y_buf_offset = 0; if(!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset; + qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (qx_needs_dequant) { @@ -3246,14 +3243,14 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset; + const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3323,14 +3320,14 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); - const uint64_t qx_buf_offset = extra_src0->offset; + const uint64_t qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qx != nullptr); } @@ -3459,7 +3456,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -3467,17 +3464,17 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * uint64_t y_buf_offset = 0; if (!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset; + qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if (!ids_uma) { d_ids = extra_ids->buffer_gpu.lock(); - ids_buf_offset = extra_ids->offset; + ids_buf_offset = extra_ids->offset + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -3636,7 +3633,7 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte const uint64_t d_sz = sizeof(float) * d_ne; vk_buffer d_D = extra->buffer_gpu.lock(); - const uint64_t 
d_buf_offset = extra->offset; + const uint64_t d_buf_offset = extra->offset + dst->view_offs; GGML_ASSERT(d_D != nullptr); vk_buffer d_X; uint64_t x_buf_offset = 0; @@ -3644,17 +3641,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte uint64_t y_buf_offset = 0; if(!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); - qx_buf_offset = extra_src0->offset; + qx_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_Qx != nullptr); } if(!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); - qy_buf_offset = extra_src1->offset; + qy_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Qy != nullptr); } if(!ids_uma) { d_ids = extra_ids->buffer_gpu.lock(); - ids_buf_offset = extra_ids->offset; + ids_buf_offset = extra_ids->offset + ids->view_offs; GGML_ASSERT(d_ids != nullptr); } if (qx_needs_dequant) { @@ -3769,9 +3766,9 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; const vk_buffer src_buf = extra_src0->buffer_gpu.lock(); - const uint64_t src_offset = extra_src0->offset; + const uint64_t src_offset = extra_src0->offset + src0->view_offs; vk_buffer dst_buf = extra->buffer_gpu.lock(); - const uint64_t dst_offset = extra->offset; + const uint64_t dst_offset = extra->offset + dst->view_offs; std::vector copies; @@ -4062,21 +4059,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; + uint64_t d_buf_offset = ((extra->offset + dst->view_offs) / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT if(!src0_uma) { d_X = extra_src0->buffer_gpu.lock(); - x_buf_offset = extra_src0->offset; + x_buf_offset = extra_src0->offset + src0->view_offs; GGML_ASSERT(d_X != nullptr); } if (use_src1 && !src1_uma) { d_Y = extra_src1->buffer_gpu.lock(); - y_buf_offset = extra_src1->offset; + y_buf_offset = extra_src1->offset + src1->view_offs; GGML_ASSERT(d_Y != nullptr); } if (use_src2 && !src2_uma) { d_Z = extra_src2->buffer_gpu.lock(); - z_buf_offset = extra_src2->offset; + z_buf_offset = extra_src2->offset + src2->view_offs; GGML_ASSERT(d_Z != nullptr); } @@ -4336,7 +4333,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + const uint32_t d_offset = ((extra->offset + dst->view_offs) % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), @@ -5569,6 +5566,13 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod const ggml_tensor * src2 = node->src[2]; switch (node->op) { + // Return on empty ops to avoid generating a compute_ctx and setting exit_tensor + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NONE: + 
return; case GGML_OP_UNARY: switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_SILU: @@ -5590,10 +5594,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_DIAG_MASK_INF: @@ -5601,7 +5601,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_ROPE: case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - case GGML_OP_NONE: case GGML_OP_ARGSORT: case GGML_OP_SUM_ROWS: break; @@ -5654,12 +5653,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_DUP: ggml_vk_cpy(ctx, ctx->compute_ctx, src0, node); - break; - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - case GGML_OP_NONE: break; case GGML_OP_NORM: ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); @@ -5712,7 +5705,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod return; } - extra->ready = true; extra->ctx_idx = ctx->compute_ctx->idx; #ifdef GGML_VULKAN_CHECK_RESULTS @@ -5796,8 +5788,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ ggml_vk_check_results_0(ctx, params, tensor); #endif - GGML_ASSERT(extra->ready); - vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; // Only run if ctx hasn't been submitted yet @@ -5822,8 +5812,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ subctx.out_memcpys.clear(); } - extra->ready = false; - return true; } @@ -5943,7 +5931,9 @@ struct ggml_backend_vk_buffer_context { ~ggml_backend_vk_buffer_context() { ggml_vk_destroy_buffer(dev_buffer); - delete[] temp_tensor_extras; + if (temp_tensor_extras != nullptr) { + delete[] temp_tensor_extras; + } } ggml_tensor_extra_gpu * ggml_vk_alloc_temp_tensor_extra() { @@ -5990,18 +5980,16 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b #endif ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; - ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); - if (tensor->view_src != nullptr && tensor->view_src->extra != nullptr) { + if (tensor->view_src != nullptr) { GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); - ggml_tensor_extra_gpu * extra_view = (ggml_tensor_extra_gpu *) tensor->view_src->extra; - extra->buffer_gpu = extra_view->buffer_gpu; - extra->offset = extra_view->offset + tensor->view_offs; + GGML_ASSERT(tensor->view_src->extra != nullptr); + tensor->extra = tensor->view_src->extra; } else { + ggml_tensor_extra_gpu * extra = ctx->ggml_vk_alloc_temp_tensor_extra(); extra->buffer_gpu = ctx->dev_buffer; extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; + tensor->extra = extra; } - - tensor->extra = extra; } GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { @@ -6014,7 +6002,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + offset, data, size); + ggml_vk_buffer_write(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size); } GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor 
* tensor, void * data, size_t offset, size_t size) { @@ -6027,7 +6015,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + offset, data, size); + ggml_vk_buffer_read(ctx->ctx, buf, extra->offset + tensor->view_offs + offset, data, size); } GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { @@ -6038,7 +6026,7 @@ GGML_CALL static bool ggml_backend_vk_buffer_cpy_tensor(ggml_backend_buffer_t bu vk_buffer src_buf = src_extra->buffer_gpu.lock(); vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); - ggml_vk_buffer_copy(dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src)); + ggml_vk_buffer_copy(dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); return true; } @@ -6264,7 +6252,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size); + ggml_vk_buffer_write_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); } GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -6284,7 +6272,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c vk_buffer buf = extra->buffer_gpu.lock(); - ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + offset, data, size); + ggml_vk_buffer_read_async(ctx, ctx->transfer_ctx, buf, extra->offset + tensor->view_offs + offset, data, size); } GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { @@ -6305,7 +6293,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c vk_buffer src_buf = src_extra->buffer_gpu.lock(); vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); - ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset + dst->view_offs, src_buf, src_extra->offset + src->view_offs, ggml_nbytes(src)); return true; } @@ -6478,11 +6466,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const // return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; // } break; case GGML_OP_ROPE: - { - const int mode = ((const int32_t *) op->op_params)[2]; - - return true; - } break; + return true; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: @@ -6725,7 +6709,7 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size); } std::cerr << "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -6809,7 +6793,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ } else if (ggml_backend_buffer_is_vk(src0->buffer)) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; vk_buffer 
buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset; + uint64_t offset = extra->offset + src0->view_offs; if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) { for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < src0->ne[2]; i2++) { @@ -6851,7 +6835,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ } else if (ggml_backend_buffer_is_vk(src1->buffer)) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset; + uint64_t offset = extra->offset + src1->view_offs; if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) { for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { @@ -6909,7 +6893,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ } else if (ggml_backend_buffer_is_vk(src2->buffer)) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src2->extra; vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - uint64_t offset = extra->offset; + uint64_t offset = extra->offset + src2->view_offs; if (!ggml_is_contiguous(src2) && ggml_vk_dim01_contiguous(src2)) { for (int i3 = 0; i3 < src2->ne[3]; i3++) { for (int i2 = 0; i2 < src2->ne[2]; i2++) { @@ -7092,11 +7076,11 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; vk_buffer buffer_gpu = extra->buffer_gpu.lock(); - if (extra->offset + tensor_size >= buffer_gpu->size) { - tensor_size = buffer_gpu->size - (extra->offset); + if (extra->offset + tensor->view_offs + tensor_size >= buffer_gpu->size) { + tensor_size = buffer_gpu->size - (extra->offset + tensor->view_offs); } - ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset + tensor->view_offs, tensor_data, tensor_size); } float first_error_result = -1.0f; From 7a16ce7db2a74a223f0f3b9cee66d4539c5bce8f Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Sat, 8 Jun 2024 07:50:31 +0000 Subject: [PATCH 29/39] server : smart slot selection using Longest Common Prefix (#7728) * server : Smart selection of available slot using Longest Common Substring * add usage * remove trailing whitespaces * Use Longest Common Prefix (LCP) instead of LCS * Rename argument --- common/common.cpp | 10 +++ common/common.h | 2 + examples/server/server.cpp | 138 ++++++++++++++++++++++++++++++++----- examples/server/utils.hpp | 7 ++ 4 files changed, 140 insertions(+), 17 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index cdcb352b5..d2a8bb69e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1491,6 +1491,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa params.chat_template = argv[i]; return true; } + if (arg == "--slot-prompt-similarity" || arg == "-sps") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.slot_prompt_similarity = std::stof(argv[i]); + return true; + } if (arg == "-pps") { params.is_pp_shared = true; return true; @@ -1913,6 +1921,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param "set custom jinja chat template (default: template taken from model's metadata)\n" "only commonly used templates are accepted:\n" "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" }); + options.push_back({ "server", "-sps, 
--slot-prompt-similarity SIMILARITY", + "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity }); #ifndef LOG_DISABLE_LOGS options.push_back({ "logging" }); diff --git a/common/common.h b/common/common.h index 35f5311e1..038f9084f 100644 --- a/common/common.h +++ b/common/common.h @@ -203,6 +203,8 @@ struct gpt_params { std::string slot_save_path; + float slot_prompt_similarity = 0.5f; + // batched-bench params bool is_pp_shared = false; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 528220607..6ffaa8d9f 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -647,6 +647,9 @@ struct server_context { server_metrics metrics; + // Necessary similarity of prompt for slot selection + float slot_prompt_similarity = 0.0f; + ~server_context() { if (ctx) { llama_free(ctx); @@ -795,24 +798,88 @@ struct server_context { return prompt_tokens; } - server_slot * get_slot(int id) { - int64_t t_last = ggml_time_us(); - - server_slot * last_used = nullptr; - + server_slot * get_slot_by_id(int id) { for (server_slot & slot : slots) { - if (slot.id == id && slot.available()) { + if (slot.id == id) { return &slot; } - - // among all available slots, find the one that has been least recently used - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } } - return last_used; + return nullptr; + } + + server_slot * get_available_slot(const std::string & prompt) { + server_slot * ret = nullptr; + + // find the slot that has at least n% prompt similarity + if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) { + int max_lcp_len = 0; + float similarity = 0; + + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (!slot.available()) { + continue; + } + + // skip the slot if it does not contains prompt + if (!slot.prompt.is_string()) { + continue; + } + + // current slot's prompt + std::string slot_prompt = slot.prompt.get(); + + // length of the current slot's prompt + int slot_prompt_len = slot_prompt.size(); + + // length of the Longest Common Prefix between the current slot's prompt and the input prompt + int lcp_len = common_part(slot_prompt, prompt); + + // fraction of the common substring length compared to the current slot's prompt length + similarity = static_cast(lcp_len) / slot_prompt_len; + + // select the current slot if the criteria match + if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) { + max_lcp_len = lcp_len; + ret = &slot; + } + } + + if (ret != nullptr) { + LOG_VERBOSE("selected slot by lcp similarity", { + {"id_slot", ret->id}, + {"max_lcp_len", max_lcp_len}, + {"similarity", similarity}, + }); + } + } + + // find the slot that has been least recently used + if (ret == nullptr) { + int64_t t_last = ggml_time_us(); + for (server_slot & slot : slots) { + // skip the slot if it is not available + if (!slot.available()) { + continue; + } + + // select the current slot if the criteria match + if (slot.t_last_used < t_last) { + t_last = slot.t_last_used; + ret = &slot; + } + } + + if (ret != nullptr) { + LOG_VERBOSE("selected slot by lru", { + {"id_slot", ret->id}, + {"t_last", t_last}, + }); + } + } + + return ret; } bool launch_slot_with_task(server_slot & slot, const server_task & task) { @@ -1515,13 +1582,29 @@ struct server_context { switch (task.type) { case SERVER_TASK_TYPE_COMPLETION: { - server_slot * slot = 
get_slot(json_value(task.data, "id_slot", -1)); + int id_slot = json_value(task.data, "id_slot", -1); + std::string prompt = json_value(task.data, "prompt", std::string()); + + server_slot * slot; + + if (id_slot != -1) { + slot = get_slot_by_id(id_slot); + } else { + slot = get_available_slot(prompt); + } + if (slot == nullptr) { // if no slot is available, we defer this task for processing later LOG_VERBOSE("no slot is available", {{"id_task", task.id}}); queue_tasks.defer(task); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } if (task.data.contains("system_prompt")) { std::string sys_prompt = json_value(task.data, "system_prompt", std::string()); @@ -1638,11 +1721,17 @@ struct server_context { case SERVER_TASK_TYPE_SLOT_SAVE: { int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot(id_slot); + server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } const size_t token_count = slot->cache_tokens.size(); const int64_t t_start = ggml_time_us(); @@ -1673,11 +1762,17 @@ struct server_context { case SERVER_TASK_TYPE_SLOT_RESTORE: { int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot(id_slot); + server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } const int64_t t_start = ggml_time_us(); @@ -1715,11 +1810,17 @@ struct server_context { case SERVER_TASK_TYPE_SLOT_ERASE: { int id_slot = task.data.at("id_slot"); - server_slot * slot = get_slot(id_slot); + server_slot * slot = get_slot_by_id(id_slot); if (slot == nullptr) { send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST); break; } + if (!slot->available()) { + // if requested slot is unavailable, we defer this task for processing later + LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}}); + queue_tasks.defer(task); + break; + } // Erase token cache const size_t n_erased = slot->cache_tokens.size(); @@ -2467,6 +2568,9 @@ int main(int argc, char ** argv) { log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; } + // Necessary similarity of prompt for slot selection + ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; + // load the model if (!ctx_server.load_model(params)) { state.store(SERVER_STATE_ERROR); diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index b7bfb41d3..63fde9c9f 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -253,6 +253,13 @@ static size_t common_part(const std::vector & a, const std::vector< return i; } +static size_t common_part(const std::string & a, const std::string & b) { + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} + + return i; +} + static bool ends_with(const std::string & str, const std::string & suffix) { return str.size() >= suffix.size() && 
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); } From d4d915d351d1f1270d56184bdd46672893e8a5d8 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Sat, 8 Jun 2024 20:21:08 +0100 Subject: [PATCH 30/39] url: save -mu downloads to new cache location (#7826) * url: save -mu download to new cache location * url: fs_get_cache_file_path util * url: tweak sig of fs_get_cache_file --- common/common.cpp | 20 ++++++++++++-------- common/common.h | 1 + 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d2a8bb69e..1591790e6 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -200,19 +200,13 @@ void gpt_params_handle_model_default(gpt_params & params) { } params.hf_file = params.model; } else if (params.model.empty()) { - std::string cache_directory = fs_get_cache_directory(); - const bool success = fs_create_directory_with_parents(cache_directory); - if (!success) { - throw std::runtime_error("failed to create cache directory: " + cache_directory); - } - params.model = cache_directory + string_split(params.hf_file, '/').back(); + params.model = fs_get_cache_file(string_split(params.hf_file, '/').back()); } } else if (!params.model_url.empty()) { if (params.model.empty()) { auto f = string_split(params.model_url, '#').front(); f = string_split(f, '?').front(); - f = string_split(f, '/').back(); - params.model = "models/" + f; + params.model = fs_get_cache_file(string_split(f, '/').back()); } } else if (params.model.empty()) { params.model = DEFAULT_MODEL_PATH; @@ -2279,6 +2273,16 @@ std::string fs_get_cache_directory() { return ensure_trailing_slash(cache_directory); } +std::string fs_get_cache_file(const std::string & filename) { + GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos); + std::string cache_directory = fs_get_cache_directory(); + const bool success = fs_create_directory_with_parents(cache_directory); + if (!success) { + throw std::runtime_error("failed to create cache directory: " + cache_directory); + } + return cache_directory + filename; +} + // // Model utils diff --git a/common/common.h b/common/common.h index 038f9084f..2345d855e 100644 --- a/common/common.h +++ b/common/common.h @@ -277,6 +277,7 @@ bool fs_validate_filename(const std::string & filename); bool fs_create_directory_with_parents(const std::string & path); std::string fs_get_cache_directory(); +std::string fs_get_cache_file(const std::string & filename); // // Model utils From fe1e3917cfa0f9397a765cfd0aef880674d938d5 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 9 Jun 2024 01:43:39 +0200 Subject: [PATCH 31/39] Revert "[SYCL] Update rpc-server.cpp to include SYCL backend (#7682)" (#7808) This reverts commit 9422c5e34bbd302493b77a8f6d546154a1f4fe82. 
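For reference, the slot-selection heuristic introduced in #7728 above can be summarized independently of the server code: among the available slots, compute the longest common prefix (LCP) between the incoming prompt and each slot's cached prompt, pick the slot with the longest LCP provided that prefix covers more than --slot-prompt-similarity of the slot's cached prompt, and otherwise fall back to the least recently used slot. The following is only a minimal Python sketch of that policy; the Slot dataclass and pick_slot helper are illustrative names that are not part of the patch, and details such as skipping busy slots or non-string prompts are omitted.

from dataclasses import dataclass

@dataclass
class Slot:
    id: int
    cached_prompt: str   # prompt from this slot's previous request ("" if none)
    t_last_used: int     # timestamp of the slot's last use

def lcp_len(a: str, b: str) -> int:
    # length of the longest common prefix of a and b
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

def pick_slot(slots: list[Slot], prompt: str, similarity: float = 0.5) -> Slot | None:
    # prefer the slot whose cached prompt shares the longest common prefix with the
    # new prompt, as long as that prefix covers enough of the cached prompt
    best, best_lcp = None, 0
    if similarity > 0.0 and prompt:
        for slot in slots:
            if not slot.cached_prompt:
                continue
            n = lcp_len(slot.cached_prompt, prompt)
            if n > best_lcp and n / len(slot.cached_prompt) > similarity:
                best, best_lcp = slot, n
    if best is None and slots:
        # no sufficiently similar cached prompt: fall back to the least recently used slot
        best = min(slots, key=lambda s: s.t_last_used)
    return best

slots = [Slot(0, "You are a poet.", 100), Slot(1, "You are a helpful assistant. Hello!", 50)]
print(pick_slot(slots, "You are a helpful assistant. How do I sort a list?").id)  # -> 1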
--- examples/rpc/rpc-server.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 62d828250..7c15d2aa4 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -6,10 +6,6 @@ #include "ggml-metal.h" #endif -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - #include "ggml-rpc.h" #ifdef _WIN32 # include @@ -83,12 +79,6 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); } -#elif GGML_USE_SYCL - fprintf(stderr, "%s: using SYCL backend\n", __func__); - backend = ggml_backend_sycl_init(0); // init device 0 - if (!backend) { - fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__); - } #endif // if there aren't GPU Backends fallback to CPU backend From ed9f2521185706481501a5e6d5315397b11802ff Mon Sep 17 00:00:00 2001 From: compilade Date: Sat, 8 Jun 2024 22:34:29 -0400 Subject: [PATCH 32/39] gguf-py : decouple adding metadata from writing in GGUFWriter (#7827) The main change of this PR is to consolidate GGUFWriter.add_key and GGUFWriter.add_val into GGUFWriter.add_key_value. In addition, use_temp_file is now opt-in instead of opt-out, defaulting to False. GGUFWriter also no longer requires an output file name until it actually writes to the file, and it no longer needs to eagerly prepare the data layout of the metadata. --- convert-hf-to-gguf.py | 8 +- gguf-py/gguf/gguf_writer.py | 270 +++++++++++++++------------ gguf-py/scripts/gguf-new-metadata.py | 6 +- 3 files changed, 160 insertions(+), 124 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a86864f04..0327712d7 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -47,7 +47,7 @@ class Model: _model_classes: dict[str, type[Model]] = {} dir_model: Path - ftype: int + ftype: gguf.LlamaFileType is_big_endian: bool endianess: gguf.GGUFEndian use_temp_file: bool @@ -94,7 +94,7 @@ class Model: ftype_lw: str = ftype_up.lower() # allow templating the file name with the output ftype, useful with the "auto" ftype self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up) - self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) + self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) @classmethod def __init_subclass__(cls): @@ -324,13 +324,13 @@ class Model: def write(self): self.write_tensors() - self.gguf_writer.write_header_to_file() + self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() def write_vocab(self): - self.gguf_writer.write_header_to_file() + self.gguf_writer.write_header_to_file(self.fname_out) self.gguf_writer.write_kv_data_to_file() self.gguf_writer.close() diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index b93747aff..ed56abfb3 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -5,6 +5,7 @@ import os import shutil import struct import tempfile +from dataclasses import dataclass from enum import Enum, auto from io import BufferedWriter from typing import IO, Any, Sequence, Mapping @@ -30,17 +31,36 @@ from .quants import quant_shape_from_byte_shape logger =
logging.getLogger(__name__) +@dataclass +class TensorInfo: + shape: Sequence[int] + dtype: GGMLQuantizationType + nbytes: int + tensor: np.ndarray[Any, Any] | None = None + + +@dataclass +class GGUFValue: + value: Any + type: GGUFValueType + + class WriterState(Enum): + NO_FILE = auto() EMPTY = auto() HEADER = auto() KV_DATA = auto() TI_DATA = auto() + WEIGHTS = auto() class GGUFWriter: - fout: BufferedWriter + fout: BufferedWriter | None + path: os.PathLike[str] | str | None temp_file: tempfile.SpooledTemporaryFile[bytes] | None - tensors: list[np.ndarray[Any, Any]] + tensors: dict[str, TensorInfo] + kv_data: dict[str, GGUFValue] + state: WriterState _simple_value_packing = { GGUFValueType.UINT8: "B", GGUFValueType.INT8: "b", @@ -56,141 +76,140 @@ class GGUFWriter: } def __init__( - self, path: os.PathLike[str] | str, arch: str, use_temp_file: bool = True, + self, path: os.PathLike[str] | str | None, arch: str, use_temp_file: bool = False, endianess: GGUFEndian = GGUFEndian.LITTLE, ): - self.fout = open(path, "wb") + self.fout = None + self.path = path self.arch = arch self.endianess = endianess - self.offset_tensor = 0 self.data_alignment = GGUF_DEFAULT_ALIGNMENT - self.kv_data = bytearray() - self.kv_data_count = 0 - self.ti_data = bytearray() - self.ti_data_count = 0 - self.ti_names = set() self.use_temp_file = use_temp_file self.temp_file = None - self.tensors = [] + self.tensors = dict() + self.kv_data = dict() logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) - self.state = WriterState.EMPTY + self.state = WriterState.NO_FILE self.add_architecture() - def write_header_to_file(self) -> None: + def open_output_file(self, path: os.PathLike[str] | str | None = None) -> None: + if self.state is WriterState.EMPTY and self.fout is not None and (path is None or path == self.path): + # allow calling this multiple times as long as the path is the same + return + if self.state is not WriterState.NO_FILE: + raise ValueError(f'Expected output file to be not yet opened, got {self.state}') + + if path is not None: + self.path = path + + if self.path is not None: + if self.fout is not None: + self.fout.close() + self.fout = open(self.path, "wb") + self.state = WriterState.EMPTY + + def write_header_to_file(self, path: os.PathLike[str] | str | None = None) -> None: + self.open_output_file(path) + if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') self._write_packed(" None: if self.state is not WriterState.HEADER: raise ValueError(f'Expected output file to contain the header, got {self.state}') + assert self.fout is not None - self.fout.write(self.kv_data) + kv_data = bytearray() + + for key, val in self.kv_data.items(): + kv_data += self._pack_val(key, GGUFValueType.STRING, add_vtype=False) + kv_data += self._pack_val(val.value, val.type, add_vtype=True) + + self.fout.write(kv_data) self.flush() self.state = WriterState.KV_DATA def write_ti_data_to_file(self) -> None: if self.state is not WriterState.KV_DATA: raise ValueError(f'Expected output file to contain KV data, got {self.state}') + assert self.fout is not None - self.fout.write(self.ti_data) + ti_data = bytearray() + offset_tensor = 0 + + for name, ti in self.tensors.items(): + ti_data += self._pack_val(name, GGUFValueType.STRING, add_vtype=False) + n_dims = len(ti.shape) + ti_data += self._pack("I", n_dims) + for i in range(n_dims): + ti_data += self._pack("Q", ti.shape[n_dims - 1 - i]) + ti_data += 
self._pack("I", ti.dtype) + ti_data += self._pack("Q", offset_tensor) + offset_tensor += GGUFWriter.ggml_pad(ti.nbytes, self.data_alignment) + + self.fout.write(ti_data) self.flush() self.state = WriterState.TI_DATA - def add_key(self, key: str) -> None: - self.add_val(key, GGUFValueType.STRING, add_vtype=False) + def add_key_value(self, key: str, val: Any, vtype: GGUFValueType) -> None: + if key in self.kv_data: + raise ValueError(f'Duplicated key name {key!r}') + + self.kv_data[key] = GGUFValue(value=val, type=vtype) def add_uint8(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT8) + self.add_key_value(key,val, GGUFValueType.UINT8) def add_int8(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT8) + self.add_key_value(key, val, GGUFValueType.INT8) def add_uint16(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT16) + self.add_key_value(key, val, GGUFValueType.UINT16) def add_int16(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT16) + self.add_key_value(key, val, GGUFValueType.INT16) def add_uint32(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT32) + self.add_key_value(key, val, GGUFValueType.UINT32) def add_int32(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT32) + self.add_key_value(key, val, GGUFValueType.INT32) def add_float32(self, key: str, val: float) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.FLOAT32) + self.add_key_value(key, val, GGUFValueType.FLOAT32) def add_uint64(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.UINT64) + self.add_key_value(key, val, GGUFValueType.UINT64) def add_int64(self, key: str, val: int) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.INT64) + self.add_key_value(key, val, GGUFValueType.INT64) def add_float64(self, key: str, val: float) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.FLOAT64) + self.add_key_value(key, val, GGUFValueType.FLOAT64) def add_bool(self, key: str, val: bool) -> None: - self.add_key(key) - self.add_val(val, GGUFValueType.BOOL) + self.add_key_value(key, val, GGUFValueType.BOOL) def add_string(self, key: str, val: str) -> None: if not val: return - self.add_key(key) - self.add_val(val, GGUFValueType.STRING) + self.add_key_value(key, val, GGUFValueType.STRING) def add_array(self, key: str, val: Sequence[Any]) -> None: if not isinstance(val, Sequence): raise ValueError("Value must be a sequence for array type") - self.add_key(key) - self.add_val(val, GGUFValueType.ARRAY) - - def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True) -> None: - if vtype is None: - vtype = GGUFValueType.get_type(val) - - if add_vtype: - self.kv_data += self._pack("I", vtype) - self.kv_data_count += 1 - - pack_fmt = self._simple_value_packing.get(vtype) - if pack_fmt is not None: - self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) - elif vtype == GGUFValueType.STRING: - encoded_val = val.encode("utf-8") if isinstance(val, str) else val - self.kv_data += self._pack("Q", len(encoded_val)) - self.kv_data += encoded_val - elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: - ltype = GGUFValueType.get_type(val[0]) - if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): - raise ValueError("All items in a GGUF 
array should be of the same type") - self.kv_data += self._pack("I", ltype) - self.kv_data += self._pack("Q", len(val)) - for item in val: - self.add_val(item, add_vtype=False) - else: - raise ValueError("Invalid GGUF metadata value type or value") + self.add_key_value(key, val, GGUFValueType.ARRAY) @staticmethod def ggml_pad(x: int, n: int) -> int: @@ -200,16 +219,12 @@ class GGUFWriter: self, name: str, tensor_shape: Sequence[int], tensor_dtype: np.dtype, tensor_nbytes: int, raw_dtype: GGMLQuantizationType | None = None, ) -> None: - if self.state is not WriterState.EMPTY: - raise ValueError(f'Expected output file to be empty, got {self.state}') + if self.state is not WriterState.NO_FILE: + raise ValueError(f'Expected output file to be not yet opened, got {self.state}') - if name in self.ti_names: - raise ValueError(f'Duplicated tensor name {name}') - self.ti_names.add(name) + if name in self.tensors: + raise ValueError(f'Duplicated tensor name {name!r}') - encoded_name = name.encode("utf-8") - self.ti_data += self._pack("Q", len(encoded_name)) - self.ti_data += encoded_name if raw_dtype is None: if tensor_dtype == np.float16: dtype = GGMLQuantizationType.F16 @@ -231,14 +246,8 @@ class GGUFWriter: dtype = raw_dtype if tensor_dtype == np.uint8: tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype) - n_dims = len(tensor_shape) - self.ti_data += self._pack("I", n_dims) - for i in range(n_dims): - self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i]) - self.ti_data += self._pack("I", dtype) - self.ti_data += self._pack("Q", self.offset_tensor) - self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment) - self.ti_data_count += 1 + + self.tensors[name] = TensorInfo(shape=tensor_shape, dtype=dtype, nbytes=tensor_nbytes) def add_tensor( self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, @@ -252,10 +261,10 @@ class GGUFWriter: self.temp_file = fp shape: Sequence[int] = raw_shape if raw_shape is not None else tensor.shape - self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype = raw_dtype) + self.add_tensor_info(name, shape, tensor.dtype, tensor.nbytes, raw_dtype=raw_dtype) if self.temp_file is None: - self.tensors.append(tensor) + self.tensors[name].tensor = tensor return tensor.tofile(self.temp_file) @@ -267,8 +276,9 @@ class GGUFWriter: fp.write(bytes([0] * pad)) def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: - if self.state is not WriterState.TI_DATA: - raise ValueError(f'Expected output file to contain tensor info, got {self.state}') + if self.state is not WriterState.TI_DATA and self.state is not WriterState.WEIGHTS: + raise ValueError(f'Expected output file to contain tensor info or weights, got {self.state}') + assert self.fout is not None if self.endianess == GGUFEndian.BIG: tensor.byteswap(inplace=True) @@ -276,50 +286,51 @@ class GGUFWriter: tensor.tofile(self.fout) self.write_padding(self.fout, tensor.nbytes) + self.state = WriterState.WEIGHTS + def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_ti_data_to_file() + assert self.fout is not None + self.write_padding(self.fout, self.fout.tell()) if self.temp_file is None: - self.tensors.reverse() # to pop from the "beginning" in constant time + bar = None if progress: from tqdm import tqdm - total_bytes = sum(t.nbytes for t in self.tensors) + total_bytes = sum(t.nbytes for t in self.tensors.values()) bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) - while True: - try: 
- tensor = self.tensors.pop() - except IndexError: - break - tensor.tofile(self.fout) - bar.update(tensor.nbytes) - self.write_padding(self.fout, tensor.nbytes) - return - while True: - try: - tensor = self.tensors.pop() - except IndexError: - break - tensor.tofile(self.fout) - self.write_padding(self.fout, tensor.nbytes) - return + # relying on the fact that Python dicts preserve insertion order (since 3.7) + for ti in self.tensors.values(): + assert ti.tensor is not None # can only iterate once over the tensors + assert ti.tensor.nbytes == ti.nbytes + ti.tensor.tofile(self.fout) + if bar is not None: + bar.update(ti.nbytes) + self.write_padding(self.fout, ti.nbytes) + ti.tensor = None + else: + self.temp_file.seek(0) - self.temp_file.seek(0) + shutil.copyfileobj(self.temp_file, self.fout) + self.flush() + self.temp_file.close() - shutil.copyfileobj(self.temp_file, self.fout) - self.flush() - self.temp_file.close() + self.state = WriterState.WEIGHTS def flush(self) -> None: + assert self.fout is not None self.fout.flush() def close(self) -> None: - self.fout.close() + if self.fout is not None: + self.fout.close() + self.fout = None def add_architecture(self) -> None: self.add_string(Keys.General.ARCHITECTURE, self.arch) @@ -449,7 +460,7 @@ class GGUFWriter: def add_rope_scaling_factor(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_FACTOR.format(arch=self.arch), value) - def add_rope_scaling_attn_factors(self, value: Sequence[float]) -> None: + def add_rope_scaling_attn_factors(self, value: float) -> None: self.add_float32(Keys.Rope.SCALING_ATTN_FACTOR.format(arch=self.arch), value) def add_rope_scaling_orig_ctx_len(self, value: int) -> None: @@ -571,5 +582,32 @@ class GGUFWriter: pack_prefix = '<' if self.endianess == GGUFEndian.LITTLE else '>' return struct.pack(f'{pack_prefix}{fmt}', value) + def _pack_val(self, val: Any, vtype: GGUFValueType, add_vtype: bool) -> bytes: + kv_data = bytearray() + + if add_vtype: + kv_data += self._pack("I", vtype) + + pack_fmt = self._simple_value_packing.get(vtype) + if pack_fmt is not None: + kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) + elif vtype == GGUFValueType.STRING: + encoded_val = val.encode("utf-8") if isinstance(val, str) else val + kv_data += self._pack("Q", len(encoded_val)) + kv_data += encoded_val + elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: + ltype = GGUFValueType.get_type(val[0]) + if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): + raise ValueError("All items in a GGUF array should be of the same type") + kv_data += self._pack("I", ltype) + kv_data += self._pack("Q", len(val)) + for item in val: + kv_data += self._pack_val(item, ltype, add_vtype=False) + else: + raise ValueError("Invalid GGUF metadata value type or value") + + return kv_data + def _write_packed(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> None: + assert self.fout is not None self.fout.write(self._pack(fmt, value, skip_pack_prefix)) diff --git a/gguf-py/scripts/gguf-new-metadata.py b/gguf-py/scripts/gguf-new-metadata.py index 21e91180c..c4b90d581 100755 --- a/gguf-py/scripts/gguf-new-metadata.py +++ b/gguf-py/scripts/gguf-new-metadata.py @@ -101,8 +101,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new logger.debug(f'Copying {field.name}') if val.value is not None: - writer.add_key(field.name) - writer.add_val(val.value, val.type) + writer.add_key_value(field.name, val.value, val.type) if 
gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: logger.debug('Adding chat template(s)') @@ -111,8 +110,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new for key, val in new_metadata.items(): logger.debug(f'Adding {key}: "{val.value}" {val.description}') - writer.add_key(key) - writer.add_val(val.value, val.type) + writer.add_key_value(key, val.value, val.type) total_bytes = 0 From 5795b941827fdec6c1662986de962badff456718 Mon Sep 17 00:00:00 2001 From: compilade Date: Sat, 8 Jun 2024 22:47:25 -0400 Subject: [PATCH 33/39] convert-hf : match model part name prefix and suffix (#7687) In #7075, to fix the conversion of (some) models using model-00001-of-00001.safetensors instead of model.safetensors for a single model part we simply used the same logic as the part count to get the part names. But this doesn't always work correctly, like when unusual additional model files like consolidated.safetensors in https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3 are present. This commit matching both the prefix and the suffix of the model part names should fix this problem without breaking any previously-supported upstream models. But according to report by @teleprint-me there is still some persistent problem, but shall do in the meantime. --- convert-hf-to-gguf.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 0327712d7..b38f48edf 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -73,10 +73,10 @@ class Model: self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager - self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") + self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, ".bin") + self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") self.hparams = Model.load_hparams(self.dir_model) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) @@ -335,10 +335,10 @@ class Model: self.gguf_writer.close() @staticmethod - def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: + def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: part_names: list[str] = [] for filename in os.listdir(dir_model): - if filename.endswith(suffix): + if filename.startswith(prefix) and filename.endswith(suffix): part_names.append(filename) part_names.sort() From 2decf57bc6e4a6b45176c3727d964a01161beecc Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Sun, 9 Jun 2024 06:39:25 +0000 Subject: [PATCH 34/39] convert-hf : set the model name based on cli arg, if present (#7693) `--model-name` argument was added a while ago but did not do anything. This commit fixes this issue and enables this feature. 
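For readers skimming this patch and the previous one, the intended behaviour is easier to see outside the diff context. The snippet below is a minimal, self-contained sketch rather than the actual converter code: the file list, helper signatures and paths are made up for illustration, but the selection and naming rules mirror what the two convert-hf-to-gguf.py diffs implement (match part files on both prefix and suffix; let an explicit --model-name override the model directory name).

```python
# Minimal sketch of the behaviour from patches 33 and 34 (hypothetical helpers
# and file names; the real logic lives in convert-hf-to-gguf.py).
from __future__ import annotations

from pathlib import Path


def get_model_part_names(filenames: list[str], prefix: str, suffix: str) -> list[str]:
    # Match both the prefix and the suffix so stray files such as
    # "consolidated.safetensors" are not mistaken for model parts.
    return sorted(f for f in filenames if f.startswith(prefix) and f.endswith(suffix))


def effective_model_name(dir_model: Path, model_name: str | None) -> str:
    # An explicit --model-name wins; otherwise fall back to the directory name.
    return dir_model.name if model_name is None else model_name


files = [
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
    "consolidated.safetensors",
    "tokenizer.json",
]
print(get_model_part_names(files, "model", ".safetensors"))
# -> ['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']

print(effective_model_name(Path("models/Mistral-7B-Instruct-v0.3"), None))
# -> Mistral-7B-Instruct-v0.3
print(effective_model_name(Path("models/Mistral-7B-Instruct-v0.3"), "my-finetune"))
# -> my-finetune
```

Matching on both ends keeps unrelated *.safetensors files out of the part list, while the name override only takes effect when --model-name is actually passed on the command line.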
--- convert-hf-to-gguf.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b38f48edf..025405a2c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -52,6 +52,7 @@ class Model: endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool + model_name: str | None part_names: list[str] is_safetensors: bool hparams: dict[str, Any] @@ -64,7 +65,7 @@ class Model: # subclasses should define this! model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None): if type(self) is Model: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -73,6 +74,7 @@ class Model: self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager + self.model_name = model_name self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: @@ -182,7 +184,7 @@ class Model: return new_name def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.block_count) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: @@ -665,7 +667,7 @@ class GPTNeoXModel(Model): def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -798,7 +800,7 @@ class MPTModel(Model): def set_gguf_parameters(self): block_count = self.hparams["n_layers"] - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_block_count(block_count) @@ -850,7 +852,7 @@ class OrionModel(Model): raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) @@ -887,7 +889,7 @@ class BaichuanModel(Model): else: raise ValueError("gguf: can not find ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) @@ -1010,7 +1012,7 @@ class XverseModel(Model): else: raise ValueError("gguf: can not find 
ctx length parameter.") - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_source_hf_repo(hf_repo) self.gguf_writer.add_tensor_data_layout("Meta AI original pth") self.gguf_writer.add_context_length(ctx_length) @@ -1206,7 +1208,7 @@ class StableLMModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -1681,7 +1683,7 @@ class GPT2Model(Model): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_context_length(self.hparams["n_ctx"]) self.gguf_writer.add_embedding_length(self.hparams["n_embd"]) @@ -2248,7 +2250,7 @@ class GemmaModel(Model): hparams = self.hparams block_count = hparams["num_hidden_layers"] - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) @@ -2348,7 +2350,7 @@ class MambaModel(Model): # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model - self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name) self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading @@ -2852,7 +2854,7 @@ def main() -> None: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name) logger.info("Set model parameters") model_instance.set_gguf_parameters() From 42b53d192f4e3abf1b7c8e424628424504ea5dc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 9 Jun 2024 09:42:25 +0200 Subject: [PATCH 35/39] CUDA: revise q8_1 data layout for mul_mat_q (#7824) --- ggml-cuda.cu | 88 +++++++++------ ggml-cuda/mmq.cu | 3 +- ggml-cuda/mmq.cuh | 236 ++++++++++++++++++++++------------------- ggml-cuda/quantize.cu | 89 ++++++++++++++-- ggml-cuda/quantize.cuh | 17 ++- 5 files changed, 282 insertions(+), 151 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index dad8a9e2d..af10f21a0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1347,10 +1347,30 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { GGML_UNUSED(main_device); } +static cudaError_t ggml_cuda_Memcpy2DPeerAsync( + void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { + 
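+    // Used below for both the quantized src1 tile copies and the dst write-back;
+    // it hides the CUDA (cudaMemcpy3DPeerAsync) vs. HIP (cudaMemcpy2DAsync) difference.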
+#if !defined(GGML_USE_HIPBLAS) + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = dstDevice; + p.dstPtr = make_cudaPitchedPtr(dst, dpitch, dpitch, height); + p.srcDevice = srcDevice; + p.srcPtr = make_cudaPitchedPtr(src, spitch, spitch, height); + p.extent = make_cudaExtent(width, height, 1); + return cudaMemcpy3DPeerAsync(&p, stream); +#else + // HIP does not support cudaMemcpy3DPeerAsync or vmm pools + GGML_UNUSED(dstDevice); + GGML_UNUSED(srcDevice); + return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); +#endif // !defined(GGML_USE_HIPBLAS) +} + static void ggml_cuda_op_mul_mat( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, - const bool convert_src1_to_q8_1) { + quantize_cuda_t quantize_src1) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; @@ -1407,7 +1427,9 @@ static void ggml_cuda_op_mul_mat( } struct dev_data { - ggml_cuda_pool_alloc src0_dd_alloc; + int cc; + + ggml_cuda_pool_alloc src0_dd_alloc; ggml_cuda_pool_alloc src1_ddf_alloc; ggml_cuda_pool_alloc src1_ddq_alloc; ggml_cuda_pool_alloc dst_dd_alloc; @@ -1426,6 +1448,8 @@ static void ggml_cuda_op_mul_mat( int used_devices = 0; for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + dev[id].cc = ggml_cuda_info().devices[id].cc; + // by default, use all rows dev[id].row_low = 0; dev[id].row_high = ne01; @@ -1476,11 +1500,15 @@ static void ggml_cuda_op_mul_mat( dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } - if (convert_src1_to_q8_1) { - dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); + if (quantize_src1) { + size_t src_1_ddq_size = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs; + if (quantize_src1 == quantize_mmq_q8_1_cuda) { + src_1_ddq_size += get_mmq_x_max_host(dev[id].cc)*sizeof(block_q8_1_mmq); + } + dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size); if (src1_on_device && src1_is_contiguous) { - quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); + quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream); CUDA_CHECK(cudaGetLastError()); } } @@ -1526,7 +1554,12 @@ static void ggml_cuda_op_mul_mat( const int64_t i03 = i0 / ne12; const int64_t i02 = i0 % ne12; - const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; + size_t src1_ddq_i_offset = i0*ne11 * src1_padded_col_size*q8_1_ts/q8_1_bs; + if (quantize_src1 == quantize_mmq_q8_1_cuda) { + src1_ddq_i_offset += src1_col_0 * sizeof(block_q8_1_mmq); + } else { + src1_ddq_i_offset += src1_col_0 * src1_padded_col_size*q8_1_ts/q8_1_bs; + } // for split tensors the data begins at i0 == i0_offset_low char * src0_dd_i = dev[id].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; @@ -1543,10 +1576,17 @@ static void ggml_cuda_op_mul_mat( // copy src0, src1 to device if necessary if (src1_is_contiguous) { if (id != ctx.device) { - if (convert_src1_to_q8_1) { + if (quantize_src1) { char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; - CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device, - src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); + if (quantize_src1 == quantize_mmq_q8_1_cuda) { + const size_t pitch = 
ne11*sizeof(block_q8_1_mmq); + const size_t width = src1_ncols*sizeof(block_q8_1_mmq); + const size_t height = src1_padded_col_size/(4*QK8_1); + CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync(src1_ddq_i, id, pitch, src1_ddq_i_source, ctx.device, pitch, width, height, stream)); + } else { + CUDA_CHECK(cudaMemcpyPeerAsync( + src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); + } } else { float * src1_ddf_i_source = (float *) src1->data; src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; @@ -1561,8 +1601,8 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && !src1_is_contiguous) { - quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + if (quantize_src1 && !src1_is_contiguous) { + quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1587,22 +1627,8 @@ static void ggml_cuda_op_mul_mat( float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; -#if !defined(GGML_USE_HIPBLAS) - // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices - cudaMemcpy3DPeerParms p = {}; - p.dstDevice = ctx.device; - p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); - p.srcDevice = id; - p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); - p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); - CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); -#else - // HIP does not support cudaMemcpy3DPeerAsync or vmm pools - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), - dst_dd_i, row_diff*sizeof(float), - row_diff*sizeof(float), src1_ncols, - cudaMemcpyDeviceToDevice, stream)); -#endif + CUDA_CHECK(ggml_cuda_Memcpy2DPeerAsync( + dhf_dst_i, ctx.device, ne0*sizeof(float), dst_dd_i, id, row_diff*sizeof(float), row_diff*sizeof(float), src1_ncols, stream)); } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); @@ -1941,13 +1967,13 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr); } else if (use_mul_mat_vec_q) { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda); } else if (use_mul_mat_q) { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, quantize_mmq_q8_1_cuda); } else { - ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, nullptr); } } diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu index 58799e4ca..1d6b9e698 100644 --- a/ggml-cuda/mmq.cu +++ b/ggml-cuda/mmq.cu @@ -11,6 +11,7 @@ void ggml_cuda_op_mul_mat_q( const int64_t nb01 = src0->nb[1]; const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne0 = 
dst->ne[0]; @@ -25,7 +26,7 @@ void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the kernel writes into const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst}; + const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst}; switch (src0->type) { case GGML_TYPE_Q4_0: diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh index 6744cce6d..3ccae8a0c 100644 --- a/ggml-cuda/mmq.cuh +++ b/ggml-cuda/mmq.cuh @@ -1,15 +1,26 @@ +#pragma once + #include "common.cuh" #include "vecdotq.cuh" #include #include +#define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) + typedef void (*load_tiles_mmq_t)( const char * __restrict__ x, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & kbx0, const int & i_max, const int & stride); typedef void (*vec_dot_mmq_t)( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, float * __restrict__ sum, const int & k0); + const int * __restrict__ y, float * __restrict__ sum, const int & k0); + +struct block_q8_1_mmq { + half2 ds[4]; + int8_t qs[4*QK8_1]; +}; +static_assert(sizeof(block_q8_1_mmq) == 4*QK8_1 + 4*sizeof(half2), "Unexpected block_q8_1_mmq size"); +static_assert(sizeof(block_q8_1_mmq) == 4*sizeof(block_q8_1), "Unexpected block_q8_1_mmq size"); struct tile_x_sizes { int ql; @@ -132,10 +143,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -145,19 +160,18 @@ static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mul_mat( const int i = i0 + threadIdx.x; const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; int u[2*VDR_Q4_0_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_0) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dmf[i*(WARP_SIZE/QI4_0) + i/QI4_0 + k0/QI4_0], + y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -203,10 +217,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * 
__restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -221,13 +238,13 @@ static __device__ __forceinline__ void vec_dot_q4_1_q8_1_mul_mat( #pragma unroll for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI4_1) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k0], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(WARP_SIZE + 1) + k0], u, x_dm[i*(WARP_SIZE/QI4_1) + i/QI4_1 + k0/QI4_1], + y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -293,10 +310,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -306,20 +327,18 @@ static __device__ __forceinline__ void vec_dot_q5_0_q8_1_mul_mat( const int i = i0 + threadIdx.x; const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; + const int index_bx = i*(WARP_SIZE/QI5_0) + i/QI5_0 + k0/QI5_0; int u[2*VDR_Q5_0_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_0) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dmf[index_bx], y_df[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -383,10 +402,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + 
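+        // Note: block_q8_1_mmq stores its 4 half2 ds values (d, sum) ahead of the 4*QK8_1 int8 quants,
+        // so the quantized values start 4 ints into each tile row of MMQ_TILE_Y_K ints.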
#pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -396,18 +418,18 @@ static __device__ __forceinline__ void vec_dot_q5_1_q8_1_mul_mat( const int i = i0 + threadIdx.x; const int kyqs = k0 % (QI8_1/2) + QI8_1 * (k0 / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k0/QI5_1; + const int index_bx = i*(WARP_SIZE/QI5_1) + i/QI5_1 + k0/QI5_1; int u[2*VDR_Q5_1_Q8_1_MMQ]; #pragma unroll for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + u[2*l+0] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j*MMQ_TILE_Y_K + (kyqs + l + QI5_1) % WARP_SIZE]; } sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k0], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); + (&x_ql[i*(2*WARP_SIZE + 1) + 2*k0], u, x_dm[index_bx], y_ds[j*MMQ_TILE_Y_K + (2*k0/QI8_1) % (WARP_SIZE/QI8_1)]); } } } @@ -455,10 +477,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -467,12 +493,9 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mul_mat( for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[j * WARP_SIZE + k0], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1]); + (&x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k0], x_dmf[i*(WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0], + y_df[j*MMQ_TILE_Y_K + k0/QI8_1]); } } } @@ -531,10 +554,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -545,11 +571,10 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( const int kbx = k0 / QI2_K; const int ky = (k0 % QI2_K) * QR2_K; - const float * y_df = (const float *) y_ds; int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int kqsx = i*(WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); #pragma 
unroll @@ -557,11 +582,11 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mul_mat( v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; } - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + const uint8_t * scales = ((const uint8_t *) &x_sc[i*(WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - const int index_y = j * WARP_SIZE + (QR2_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq( - v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); + v, &y_qs[j*MMQ_TILE_Y_K + (QR2_K*k0) % WARP_SIZE], scales, + x_dm[i*(WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[j*MMQ_TILE_Y_K + ((QR2_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -646,7 +671,11 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { + + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { @@ -658,8 +687,6 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( const int kbx = k0 / QI3_K; const int ky = (k0 % QI3_K) * QR3_K; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; @@ -667,19 +694,19 @@ static __device__ __forceinline__ void vec_dot_q3_K_q8_1_mul_mat( #pragma unroll for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int kqsx = i*(WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); const int shift = 2 * ((ky % 32) / 8); const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vh = x_qh[i*(WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); const int vlh = (vh << 2) & 0x04040404; v[l] = __vsubss4(vll, vlh); } - const int index_y = j * WARP_SIZE + (k0*QR3_K) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q3_K_q8_1_impl_mmq( - v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); + v, &y_qs[j*MMQ_TILE_Y_K + (k0*QR3_K) % WARP_SIZE], scales, + x_dmf[i*(WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[j*MMQ_TILE_Y_K + ((k0*QR3_K) % WARP_SIZE)/QI8_1]); } } } @@ -746,10 +773,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -760,9 +790,9 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mul_mat( const uint8_t * 
sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2*((k0 % 16) / 8); - const int index_y = j * WARP_SIZE + (QR4_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q4_K_q8_1_impl_mmq( - &x_ql[i * (WARP_SIZE + 1) + k0], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); + &x_ql[i*(WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + (QR4_K*k0) % WARP_SIZE], sc, sc+8, + x_dm[i*(WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[j*MMQ_TILE_Y_K + ((QR4_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -842,10 +872,13 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -856,10 +889,9 @@ static __device__ __forceinline__ void vec_dot_q5_K_q8_1_mul_mat( const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/16]) + 2 * ((k0 % 16) / 8); - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k0; - const int index_y = j * WARP_SIZE + (QR5_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q5_K_q8_1_impl_mmq( - &x_ql[index_x], &y_qs[index_y], sc, sc+8, x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); + &x_ql[i*(QR5_K*WARP_SIZE + 1) + QR5_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR5_K*k0) % WARP_SIZE], sc, sc+8, + x_dm[i*(WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[j*MMQ_TILE_Y_K + ((QR5_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -932,10 +964,14 @@ template static __device__ __forceinlin template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, float * __restrict__ sum, const int & k0) { + const int * __restrict__ y, float * __restrict__ sum, const int & k0) { GGML_UNUSED(x_qh); + const float * x_dmf = (const float *) x_dm; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + #pragma unroll for (int j0 = 0; j0 < mmq_x; j0 += nwarps) { const int j = j0 + threadIdx.y; @@ -944,15 +980,11 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mul_mat( for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) { const int i = i0 + threadIdx.x; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k0/8]); - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k0; - const int index_y = j * WARP_SIZE + (QR6_K*k0) % WARP_SIZE; sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q6_K_q8_1_impl_mmq( - &x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); + &x_ql[i*(QR6_K*WARP_SIZE + 1) + QR6_K*k0], &y_qs[j*MMQ_TILE_Y_K + (QR6_K*k0) % WARP_SIZE], sc, + x_dmf[i*(WARP_SIZE/QI6_K) + i/QI6_K], &y_df[j*MMQ_TILE_Y_K + ((QR6_K*k0) % WARP_SIZE)/QI8_1]); } } } @@ -964,7 +996,6 @@ struct mmq_type_traits; template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int 
vdr = VDR_Q4_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_0_q8_1_mul_mat; @@ -972,7 +1003,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q4_1_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_1; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_1_q8_1_mul_mat; @@ -980,7 +1010,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q5_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_0; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_0_q8_1_mul_mat; @@ -988,7 +1017,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q5_1_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_1; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_1_q8_1_mul_mat; @@ -996,7 +1024,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q8_0_q8_1_mul_mat; @@ -1004,7 +1031,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q2_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q2_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q2_K_q8_1_mul_mat; @@ -1012,7 +1038,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q3_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q3_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q3_K_q8_1_mul_mat; @@ -1020,7 +1045,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q4_K_q8_1_mul_mat; @@ -1028,7 +1052,6 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = true; static constexpr int vdr = VDR_Q5_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q5_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q5_K_q8_1_mul_mat; @@ -1036,12 +1059,36 @@ struct mmq_type_traits { template struct mmq_type_traits { - static constexpr bool need_sum = false; static constexpr int vdr = VDR_Q6_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q6_K; static constexpr vec_dot_mmq_t vec_dot = vec_dot_q6_K_q8_1_mul_mat; }; +static int mmq_need_sum(const ggml_type type_x) { + switch (type_x) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return true; + case GGML_TYPE_Q5_0: + return false; + case GGML_TYPE_Q5_1: + return true; + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + return false; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return true; + case GGML_TYPE_Q6_K: + return false; + default: + GGML_ASSERT(false); + break; + } + return false; +} + template #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -1056,7 +1103,7 @@ template #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) static __global__ 
void mul_mat_q( const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, - const int ne00, const int ne01, const int stride00, const int ne10, const int ne11, const int ne0) { + const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device()) { @@ -1068,7 +1115,6 @@ static __global__ void mul_mat_q( constexpr int qr = ggml_cuda_type_traits::qr; constexpr int qi = ggml_cuda_type_traits::qi; constexpr int mmq_y = get_mmq_y_device(mmq_x); - constexpr bool need_sum = mmq_type_traits::need_sum; constexpr int vdr = mmq_type_traits::vdr; constexpr load_tiles_mmq_t load_tiles = mmq_type_traits::load_tiles; constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot; @@ -1080,62 +1126,38 @@ static __global__ void mul_mat_q( half2 * tile_x_dm = (half2 *) (tile_x_ql + txs.ql); int * tile_x_qh = (int *) (tile_x_dm + txs.dm); int * tile_x_sc = (int *) (tile_x_qh + txs.qh); - int * tile_y_qs = (int *) (tile_x_sc + txs.sc); // [mmq_x * WARP_SIZE] - half2 * tile_y_ds = (half2 *) (tile_y_qs + mmq_x*WARP_SIZE); // [mmq_x * WARP_SIZE/QI8_1]; - - const block_q8_1 * y = (const block_q8_1 *) yc; + int * tile_y = (int *) (tile_x_sc + txs.sc); // [mmq_x * (WARP_SIZE + WARP_SIZE/QI8_1)] const int blocks_per_row_x = ne00 / qk; - const int blocks_per_col_y = ne10 / QK8_1; const int blocks_per_warp = WARP_SIZE / qi; const int & ne1 = ne11; const int tile_x_max_i = ne01 - blockIdx.x*mmq_y - 1; + const int * y = (const int *) yc + blockIdx.y*(mmq_x*sizeof(block_q8_1_mmq)/sizeof(int)); + float sum[(mmq_x/nwarps) * (mmq_y/WARP_SIZE)] = {0.0f}; for (int kb0 = 0; kb0 < blocks_per_row_x; kb0 += blocks_per_warp) { - load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride00*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride00); + load_tiles(x, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, stride01*blockIdx.x*mmq_y + kb0, tile_x_max_i, stride01); #pragma unroll for (int kr = 0; kr < qr; ++kr) { - const int kqs = kr*WARP_SIZE + threadIdx.x; - const int kbxd = kqs / QI8_1; - + const int * by0 = y + stride11*(kb0*(qk*sizeof(block_q8_1_mmq) / (4*QK8_1*sizeof(int))) + kr*sizeof(block_q8_1_mmq)/sizeof(int)); #pragma unroll - for (int i0 = 0; i0 < mmq_x; i0 += nwarps) { - const int i = min(blockIdx.y*mmq_x + threadIdx.y + i0, ne11-1); // to prevent out-of-bounds memory accesses + for (int l0 = 0; l0 < mmq_x*MMQ_TILE_Y_K; l0 += nwarps*WARP_SIZE) { + int l = l0 + threadIdx.y*WARP_SIZE + threadIdx.x; - const block_q8_1 * by0 = &y[i*blocks_per_col_y + kb0 * (qk/QK8_1) + kbxd]; - - const int index_y = (i0 + threadIdx.y) * WARP_SIZE + kqs % WARP_SIZE; - tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - -#pragma unroll - for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE/QI8_1); - const int i_y_eff = min(blockIdx.y*mmq_x + ids, ne11-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[i_y_eff*blocks_per_col_y + kb0 * (qk/QK8_1) + kr*(WARP_SIZE/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } + tile_y[l] = by0[l]; } __syncthreads(); // #pragma unroll // unrolling 
this loop causes too much register pressure for (int k0 = kr*WARP_SIZE/qr; k0 < (kr+1)*WARP_SIZE/qr; k0 += vdr) { - vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, sum, k0); + vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y, sum, k0); } __syncthreads(); @@ -1165,8 +1187,8 @@ static __global__ void mul_mat_q( struct mmq_args { const char * x; const char * y; float * dst; - int64_t ne00; int64_t ne01; int64_t stride00; - int64_t ne10; int64_t ne11; + int64_t ne00; int64_t ne01; int64_t stride01; + int64_t ne10; int64_t ne11; int64_t stride11; int64_t ne0; }; @@ -1184,7 +1206,7 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) { const tile_x_sizes txs = get_tile_x_sizes_host(type, mmq_y); const int shmem_x = txs.ql*sizeof(int) + txs.dm*sizeof(half2) + txs.qh*sizeof(int) + txs.sc*sizeof(int); const int shmem_y = mmq_x*WARP_SIZE*sizeof(int) + mmq_x*(WARP_SIZE/QI8_1)*sizeof(half2); - const int shmem = shmem_x + shmem_y; + const int shmem = shmem_x + GGML_PAD(shmem_y, nwarps*WARP_SIZE*sizeof(int)); #if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false}; @@ -1198,11 +1220,11 @@ static void launch_mul_mat_q(const mmq_args & args, cudaStream_t stream) { if (args.ne01 % mmq_y == 0) { const bool need_check = false; mul_mat_q<<>> - (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0); + (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); } else { const bool need_check = true; mul_mat_q<<>> - (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride00, args.ne10, args.ne11, args.ne0); + (args.x, args.y, args.dst, args.ne00, args.ne01, args.stride01, args.ne10, args.ne11, args.stride11, args.ne0); } } diff --git a/ggml-cuda/quantize.cu b/ggml-cuda/quantize.cu index 7578c4b6c..b46786822 100644 --- a/ggml-cuda/quantize.cu +++ b/ggml-cuda/quantize.cu @@ -1,22 +1,23 @@ #include "quantize.cuh" +#include -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) { - const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { + const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (ix >= kx_padded) { + if (ix0 >= kx0_padded) { return; } - const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y; + const int64_t ix1 = blockIdx.y; - const int64_t i_padded = (int64_t)iy*kx_padded + ix; + const int64_t i_padded = ix1*kx0_padded + ix0; block_q8_1 * y = (block_q8_1 *) vy; const int64_t ib = i_padded / QK8_1; // block index const int64_t iqs = i_padded % QK8_1; // quant index - const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + const float xi = ix0 < kx ? 
x[ix1*kx + ix0] : 0.0f; float amax = fabsf(xi); float sum = xi; @@ -36,10 +37,76 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest reinterpret_cast(y[ib].ds.y) = sum; } -void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) { - const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, ky, 1); - const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx, kx_padded); +template +static __global__ void quantize_mmq_q8_1( + const float * __restrict__ x, void * __restrict__ vy, const int64_t kx0, const int64_t kx1, const int64_t kx0_padded) { + + const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; + + if (ix0 >= kx0_padded) { + return; + } + + const int64_t ix1 = kx1*blockIdx.z + blockIdx.y; + + block_q8_1_mmq * y = (block_q8_1_mmq *) vy; + + const int64_t ib0 = blockIdx.z*(gridDim.y*gridDim.x*blockDim.x/(4*QK8_1)); // first block of channel + const int64_t ib = ib0 + (ix0 / (4*QK8_1))*kx1 + blockIdx.y; // block index in channel + const int64_t iqs = ix0 % (4*QK8_1); // quant index in block + + const float xi = ix0 < kx0 ? x[ix1*kx0 + ix0] : 0.0f; + float amax = fabsf(xi); + + amax = warp_reduce_max(amax); + + float sum; + if (need_sum) { + sum = warp_reduce_sum(xi); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs % QK8_1 != 0) { + return; + } + + if (need_sum) { + y[ib].ds[iqs/QK8_1] = make_half2(d, sum); + } else { + ((float *) y[ib].ds)[iqs/QK8_1] = d; + } } +void quantize_row_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, + const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + + GGML_ASSERT(kx0_padded % QK8_1 == 0); + + const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, kx1*channels, 1); + const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); + quantize_q8_1<<>>(x, vy, kx0, kx0_padded); + + GGML_UNUSED(type_x); +} + +void quantize_mmq_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, + const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + + GGML_ASSERT(kx0_padded % (4*QK8_1) == 0); + + const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, kx1, channels); + const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); + if (mmq_need_sum(type_x)) { + quantize_mmq_q8_1<<>>(x, vy, kx0, kx1, kx0_padded); + } else { + quantize_mmq_q8_1<<>>(x, vy, kx0, kx1, kx0_padded); + } +} diff --git a/ggml-cuda/quantize.cuh b/ggml-cuda/quantize.cuh index b37a4752f..486c9360a 100644 --- a/ggml-cuda/quantize.cuh +++ b/ggml-cuda/quantize.cuh @@ -1,5 +1,20 @@ +#pragma once + #include "common.cuh" +#include "mmq.cuh" + +#include #define CUDA_QUANTIZE_BLOCK_SIZE 256 -void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream); +typedef void (*quantize_cuda_t)( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); + +void quantize_row_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, 
const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); + +void quantize_mmq_q8_1_cuda( + const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, + const ggml_type type_x, cudaStream_t stream); From 3e2ee443159724e2d3a0741f6b167e599ec088aa Mon Sep 17 00:00:00 2001 From: mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> Date: Sun, 9 Jun 2024 12:50:35 +0200 Subject: [PATCH 36/39] server: do not remove whitespace at the start of a completion chunk (#7830) --- examples/server/public/index-new.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/public/index-new.html b/examples/server/public/index-new.html index d571c2779..19c9f643d 100644 --- a/examples/server/public/index-new.html +++ b/examples/server/public/index-new.html @@ -416,7 +416,7 @@ message = html`<${Probabilities} data=${data} />` } else { const text = isArrayMessage ? - data.map(msg => msg.content).join('').replace(/^\s+/, '') : + data.map(msg => msg.content).join('') : data; message = isCompletionMode ? text : From 57bf62ce7cb75cca589943e2050d29bff4026e76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=A1s=20P=C3=A9rez?= Date: Sun, 9 Jun 2024 11:24:29 -0400 Subject: [PATCH 37/39] docs: Added initial PR template with directions for doc only changes and squash merges [no ci] (#7700) This commit adds pull_request_template.md and CONTRIBUTING.md . It focuses on explaining to contributors the need to rate PR complexity level, when to add [no ci] and how to format PR title and descriptions. Co-authored-by: Brian Co-authored-by: compilade --- .../PULL_REQUEST_TEMPLATE/pull_request_template.md | 5 +++++ CONTRIBUTING.md | 14 ++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE/pull_request_template.md create mode 100644 CONTRIBUTING.md diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md new file mode 100644 index 000000000..0852fded5 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -0,0 +1,5 @@ +- Self Reported Review Complexity: + - [ ] Review Complexity : Low + - [ ] Review Complexity : Medium + - [ ] Review Complexity : High +- [ ] I have read the [contributing guidelines](CONTRIBUTING.md) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..991d85e49 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing Guidelines + +## Checklist + +* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines) +* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library +* Execute [the full CI locally on your machine](ci/README.md) before publishing + +## PR formatting + +* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs. + - The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information. 
+* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times. +* When squashing multiple commits on merge, use the following format for your commit title: ` : (#)`. For example: `utils : Fix typo in utils.py (#1234)` From e95beeb1fc4621826ddd616776dbdf717366bf5c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 9 Jun 2024 20:19:35 +0300 Subject: [PATCH 38/39] imatrix : handle partial entries (#7833) --- examples/imatrix/imatrix.cpp | 58 +++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index e18f49563..574f5ed9c 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -218,20 +218,64 @@ void IMatrixCollector::save_imatrix(int ncall) const { fname += std::to_string(ncall); } + // avoid writing imatrix entries that do not have full data + // this can happen with MoE models where some of the experts end up not being exercised by the provided training data + + int n_entries = 0; + std::vector to_store; + + bool is_first = true; // for printing + for (const auto & kv : m_stats) { + const int n_all = kv.second.counts.size(); + + if (n_all == 0) { + continue; + } + + int n_zeros = 0; + for (const int c : kv.second.counts) { + if (c == 0) { + n_zeros++; + } + } + + if (n_zeros != 0 && is_first) { + fprintf(stderr, "\n"); + is_first = false; + } + + if (n_zeros == n_all) { + fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str()); + continue; + } + + if (n_zeros > 0) { + fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all); + continue; + } + + n_entries++; + to_store.push_back(kv.first); + } + + if (to_store.size() < m_stats.size()) { + fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size()); + } + std::ofstream out(fname, std::ios::binary); - int n_entries = m_stats.size(); out.write((const char *) &n_entries, sizeof(n_entries)); - for (const auto & p : m_stats) { - int len = p.first.size(); + for (const auto & name : to_store) { + const auto & stat = m_stats.at(name); + int len = name.size(); out.write((const char *) &len, sizeof(len)); - out.write(p.first.c_str(), len); - out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); - int nval = p.second.values.size(); + out.write(name.c_str(), len); + out.write((const char *) &stat.ncall, sizeof(stat.ncall)); + int nval = stat.values.size(); out.write((const char *) &nval, sizeof(nval)); if (nval > 0) { std::vector tmp(nval); for (int i = 0; i < nval; i++) { - tmp[i] = (p.second.values[i] / static_cast(p.second.counts[i])) * static_cast(p.second.ncall); + tmp[i] = (stat.values[i] / static_cast(stat.counts[i])) * static_cast(stat.ncall); } out.write((const char*)tmp.data(), nval*sizeof(float)); } From 10ceba354a3b152ff425e9fa97f9caaef99a46b1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 10 Jun 2024 02:04:50 +0300 Subject: [PATCH 39/39] flake.lock: Update (#7838) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/ad57eef4ef0659193044870c731987a6df5cf56b?narHash=sha256-SzDKxseEcHR5KzPXLwsemyTR/kaM9whxeiJohbL04rs%3D' (2024-05-29) → 
'github:NixOS/nixpkgs/051f920625ab5aabe37c920346e3e69d7d34400e?narHash=sha256-4q0s6m0GUcN7q%2BY2DqD27iLvbcd1G50T2lv08kKxkSI%3D' (2024-06-07) Co-authored-by: github-actions[bot] --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index 09047ab10..7272e65fa 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1716948383, - "narHash": "sha256-SzDKxseEcHR5KzPXLwsemyTR/kaM9whxeiJohbL04rs=", + "lastModified": 1717786204, + "narHash": "sha256-4q0s6m0GUcN7q+Y2DqD27iLvbcd1G50T2lv08kKxkSI=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "ad57eef4ef0659193044870c731987a6df5cf56b", + "rev": "051f920625ab5aabe37c920346e3e69d7d34400e", "type": "github" }, "original": {