Merge branch 'ggerganov:master' into master

2025-01-27 20:43:07 +01:00 · 2025-01-19 13:04:21 -07:00 · 2025-01-19 13:04:21 -07:00 · 161c25f776
commit 161c25f776
parent 059325d9b4 92bc493917
28 changed files with 3407 additions and 606 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -87,6 +87,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip ./build/bin/*

      - name: Upload artifacts
@ -149,6 +150,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip ./build/bin/*

      - name: Upload artifacts
@ -217,6 +219,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          cp LICENSE ./build/bin/
+          cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*

      - name: Upload artifacts
@ -234,7 +237,7 @@ jobs:
    strategy:
      matrix:
        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
+        build_type: [Debug]

    steps:
      - name: Clone
@ -796,6 +799,7 @@ jobs:
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
          Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
+          Copy-Item .\examples\run\linenoise.cpp\LICENSE .\build\bin\Release\linenoise.cpp.txt
          7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*

      - name: Upload artifacts
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@ -112,9 +112,9 @@ jobs:
              -DGGML_OPENMP=OFF ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

-      - name: Build
-        id: cmake_build
-        if: ${{ matrix.sanitizer != 'THREAD' }}
+      - name: Build (sanitizers)
+        id: cmake_build_sanitizers
+        if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
        run: |
          cmake -B build \
              -DGGML_NATIVE=OFF \
@ -124,12 +124,31 @@ jobs:
              -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server

+      - name: Build (sanitizers)
+        id: cmake_build
+        if: ${{ matrix.sanitizer == '' }}
+        run: |
+          cmake -B build \
+              -DGGML_NATIVE=OFF \
+              -DLLAMA_BUILD_SERVER=ON \
+              -DLLAMA_CURL=ON \
+              -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
+          cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
+
      - name: Tests
        id: server_integration_tests
+        if: ${{ matrix.sanitizer == '' }}
        run: |
          cd examples/server/tests
          ./tests.sh

+      - name: Tests (sanitizers)
+        id: server_integration_tests_sanitizers
+        if: ${{ matrix.sanitizer != '' }}
+        run: |
+          cd examples/server/tests
+          LLAMA_SANITIZE=1 ./tests.sh
+
      - name: Slow tests
        id: server_integration_tests_slow
        if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -83,11 +83,8 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/common.cmake)

 # override ggml options
-set(GGML_SANITIZE_THREAD    ${LLAMA_SANITIZE_THREAD})
-set(GGML_SANITIZE_ADDRESS   ${LLAMA_SANITIZE_ADDRESS})
-set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
-set(GGML_ALL_WARNINGS       ${LLAMA_ALL_WARNINGS})
-set(GGML_FATAL_WARNINGS     ${LLAMA_FATAL_WARNINGS})
+set(GGML_ALL_WARNINGS   ${LLAMA_ALL_WARNINGS})
+set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})

 # change the default for these ggml options
 if (NOT DEFINED GGML_LLAMAFILE)
@ -117,16 +114,62 @@ llama_option_depr(WARNING     LLAMA_SYCL                GGML_SYCL)
 llama_option_depr(WARNING     LLAMA_SYCL_F16            GGML_SYCL_F16)
 llama_option_depr(WARNING     LLAMA_CANN                GGML_CANN)

+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        message(STATUS "Using -fsanitize=thread")
+
+        add_compile_options(-fsanitize=thread)
+        link_libraries     (-fsanitize=thread)
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        message(STATUS "Using -fsanitize=address")
+
+        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries     (-fsanitize=address)
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        message(STATUS "Using -fsanitize=undefined")
+
+        add_compile_options(-fsanitize=undefined)
+        link_libraries     (-fsanitize=undefined)
+    endif()
+endif()
+
 #
-# build the library
+# 3rd-party
 #

 if (NOT TARGET ggml)
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
+
+#
+# build the library
+#
+
 add_subdirectory(src)

+#
+# utils, programs, examples and tests
+#
+
+if (LLAMA_BUILD_COMMON)
+    add_subdirectory(common)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    include(CTest)
+    add_subdirectory(tests)
+endif()
+
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+    add_subdirectory(pocs)
+endif()
+
 #
 # install
 #
@ -200,21 +243,3 @@ configure_file(cmake/llama.pc.in

 install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
        DESTINATION lib/pkgconfig)
-
-#
-# utils, programs, examples and tests
-#
-
-if (LLAMA_BUILD_COMMON)
-    add_subdirectory(common)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    include(CTest)
-    add_subdirectory(tests)
-endif()
-
-if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-    add_subdirectory(pocs)
-endif()
--- a/common/arg.cpp
+++ b/common/arg.cpp
@ -2254,6 +2254,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
            params.vocoder.model = value;
        }
    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
+     add_opt(common_arg(
+        {"--tts-use-guide-tokens"},
+        "Use guide tokens to improve TTS word recall",
+        [](common_params & params) {
+            params.vocoder.use_guide_tokens = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));

    // model-specific
    add_opt(common_arg(
--- a/common/common.h
+++ b/common/common.h
@ -184,6 +184,8 @@ struct common_params_vocoder {

    std::string model     = ""; // model path                                                // NOLINT
    std::string model_url = ""; // model url to download                                     // NOLINT
+
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
 };

 struct common_params {
--- a/examples/run/CMakeLists.txt
+++ b/examples/run/CMakeLists.txt
@ -1,5 +1,5 @@
 set(TARGET llama-run)
-add_executable(${TARGET} run.cpp)
+add_executable(${TARGET} run.cpp linenoise.cpp/linenoise.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
--- a/examples/run/linenoise.cpp/LICENSE
+++ b/examples/run/linenoise.cpp/LICENSE
@ -0,0 +1,26 @@
+Copyright (c) 2010-2014, Salvatore Sanfilippo <antirez at gmail dot com>
+Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice,
+  this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/examples/run/linenoise.cpp/linenoise.cpp
+++ b/examples/run/linenoise.cpp/linenoise.cpp
--- a/examples/run/linenoise.cpp/linenoise.h
+++ b/examples/run/linenoise.cpp/linenoise.h
@ -0,0 +1,114 @@
+/* linenoise.h -- VERSION 1.0
+ *
+ * Guerrilla line editing library against the idea that a line editing lib
+ * needs to be 20,000 lines of C++ code.
+ *
+ * See linenoise.cpp for more information.
+ *
+ * ------------------------------------------------------------------------
+ *
+ * Copyright (c) 2010-2023, Salvatore Sanfilippo <antirez at gmail dot com>
+ * Copyright (c) 2010-2013, Pieter Noordhuis <pcnoordhuis at gmail dot com>
+ * Copyright (c) 2025, Eric Curtin <ericcurtin17 at gmail dot com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *  *  Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *
+ *  *  Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __LINENOISE_H
+#define __LINENOISE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stddef.h> /* For size_t. */
+
+extern const char *linenoiseEditMore;
+
+/* The linenoiseState structure represents the state during line editing.
+ * We pass this state to functions implementing specific editing
+ * functionalities. */
+struct linenoiseState {
+    int in_completion;  /* The user pressed TAB and we are now in completion
+                         * mode, so input is handled by completeLine(). */
+    size_t completion_idx; /* Index of next completion to propose. */
+    int ifd;            /* Terminal stdin file descriptor. */
+    int ofd;            /* Terminal stdout file descriptor. */
+    char *buf;          /* Edited line buffer. */
+    size_t buflen;      /* Edited line buffer size. */
+    const char *prompt; /* Prompt to display. */
+    size_t plen;        /* Prompt length. */
+    size_t pos;         /* Current cursor position. */
+    size_t oldpos;      /* Previous refresh cursor position. */
+    size_t len;         /* Current edited line length. */
+    size_t cols;        /* Number of columns in terminal. */
+    size_t oldrows;     /* Rows used by last refrehsed line (multiline mode) */
+    int history_index;  /* The history index we are currently editing. */
+};
+
+typedef struct linenoiseCompletions {
+  size_t len;
+  char **cvec;
+} linenoiseCompletions;
+
+/* Non blocking API. */
+int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, char *buf, size_t buflen, const char *prompt);
+const char *linenoiseEditFeed(struct linenoiseState *l);
+void linenoiseEditStop(struct linenoiseState *l);
+void linenoiseHide(struct linenoiseState *l);
+void linenoiseShow(struct linenoiseState *l);
+
+/* Blocking API. */
+const char *linenoise(const char *prompt);
+void linenoiseFree(void *ptr);
+
+/* Completion API. */
+typedef void(linenoiseCompletionCallback)(const char *, linenoiseCompletions *);
+typedef const char*(linenoiseHintsCallback)(const char *, int *color, int *bold);
+typedef void(linenoiseFreeHintsCallback)(const char *);
+void linenoiseSetCompletionCallback(linenoiseCompletionCallback *);
+void linenoiseSetHintsCallback(linenoiseHintsCallback *);
+void linenoiseSetFreeHintsCallback(linenoiseFreeHintsCallback *);
+void linenoiseAddCompletion(linenoiseCompletions *, const char *);
+
+/* History API. */
+int linenoiseHistoryAdd(const char *line);
+int linenoiseHistorySetMaxLen(int len);
+int linenoiseHistorySave(const char *filename);
+int linenoiseHistoryLoad(const char *filename);
+
+/* Other utilities. */
+void linenoiseClearScreen(void);
+void linenoiseSetMultiLine(int ml);
+void linenoisePrintKeyCodes(void);
+void linenoiseMaskModeEnable(void);
+void linenoiseMaskModeDisable(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __LINENOISE_H */
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -19,12 +19,14 @@
 #include <cstring>
 #include <filesystem>
 #include <iostream>
+#include <list>
 #include <sstream>
 #include <string>
 #include <vector>

 #include "common.h"
 #include "json.hpp"
+#include "linenoise.cpp/linenoise.h"
 #include "llama-cpp.h"

 #if defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) || defined(_WIN32)
@ -536,7 +538,7 @@ class LlamaData {
    llama_sampler_ptr               sampler;
    llama_context_ptr               context;
    std::vector<llama_chat_message> messages;
-    std::vector<std::string>        msg_strs;
+    std::list<std::string>          msg_strs;
    std::vector<char>               fmtted;

    int init(Opt & opt) {
@ -807,24 +809,44 @@ static int generate(LlamaData & llama_data, const std::string & prompt, std::str
        batch = llama_batch_get_one(&new_token_id, 1);
    }

+    printf("\033[0m");
    return 0;
 }

-static int read_user_input(std::string & user) {
-    std::getline(std::cin, user);
+static int read_user_input(std::string & user_input) {
+    static const char * prompt_prefix = "> ";
+#ifdef WIN32
+    printf(
+        "\r%*s"
+        "\r\033[0m%s",
+        get_terminal_width(), " ", prompt_prefix);
+
+    std::getline(std::cin, user_input);
    if (std::cin.eof()) {
        printf("\n");
        return 1;
    }
-
-    if (user == "/bye") {
+#else
+    std::unique_ptr<char, decltype(&std::free)> line(const_cast<char *>(linenoise(prompt_prefix)), free);
+    if (!line) {
        return 1;
    }

-    if (user.empty()) {
+    user_input = line.get();
+#endif
+
+    if (user_input == "/bye") {
+        return 1;
+    }
+
+    if (user_input.empty()) {
        return 2;
    }

+#ifndef WIN32
+    linenoiseHistoryAdd(line.get());
+#endif
+
    return 0;  // Should have data in happy path
 }

@ -865,10 +887,6 @@ static int handle_user_input(std::string & user_input, const std::string & user)
        return 0;  // No need for interactive input
    }

-    printf(
-        "\r%*s"
-        "\r\033[32m> \033[0m",
-        get_terminal_width(), " ");
    return read_user_input(user_input);  // Returns true if input ends the loop
 }

--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -19,6 +19,7 @@
 #include "loading.html.hpp"

 #include <atomic>
+#include <chrono>
 #include <condition_variable>
 #include <cstddef>
 #include <cinttypes>
@ -32,6 +33,8 @@

 using json = nlohmann::ordered_json;

+constexpr int HTTP_POLLING_SECONDS = 1;
+
 enum stop_type {
    STOP_TYPE_NONE,
    STOP_TYPE_EOS,
@ -1602,6 +1605,30 @@ struct server_response {
        // should never reach here
    }

+    // same as recv(), but have timeout in seconds
+    // if timeout is reached, nullptr is returned
+    server_task_result_ptr recv_with_timeout(const std::unordered_set<int> & id_tasks, int timeout) {
+        while (true) {
+            std::unique_lock<std::mutex> lock(mutex_results);
+            bool cr_res = condition_results.wait_for(lock, std::chrono::seconds(timeout), [&]{
+                return !queue_results.empty();
+            });
+            if (!cr_res) {
+                return nullptr;
+            }
+
+            for (int i = 0; i < (int) queue_results.size(); i++) {
+                if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) {
+                    server_task_result_ptr res = std::move(queue_results[i]);
+                    queue_results.erase(queue_results.begin() + i);
+                    return res;
+                }
+            }
+        }
+
+        // should never reach here
+    }
+
    // single-task version of recv()
    server_task_result_ptr recv(int id_task) {
        std::unordered_set<int> id_tasks = {id_task};
@ -2322,10 +2349,21 @@ struct server_context {
    void receive_multi_results(
            const std::unordered_set<int> & id_tasks,
            const std::function<void(std::vector<server_task_result_ptr>&)> & result_handler,
-            const std::function<void(json)> & error_handler) {
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
        std::vector<server_task_result_ptr> results(id_tasks.size());
-        for (size_t i = 0; i < id_tasks.size(); i++) {
-            server_task_result_ptr result = queue_results.recv(id_tasks);
+        for (int i = 0; i < (int)id_tasks.size(); i++) {
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                i--; // retry
+                continue;
+            }

            if (result->is_error()) {
                error_handler(result->to_json());
@ -2349,10 +2387,20 @@ struct server_context {
    void receive_cmpl_results_stream(
            const std::unordered_set<int> & id_tasks,
            const std::function<bool(server_task_result_ptr&)> & result_handler,
-            const std::function<void(json)> & error_handler) {
+            const std::function<void(json)> & error_handler,
+            const std::function<bool()> & is_connection_closed) {
        size_t n_finished = 0;
        while (true) {
-            server_task_result_ptr result = queue_results.recv(id_tasks);
+            server_task_result_ptr result = queue_results.recv_with_timeout(id_tasks, HTTP_POLLING_SECONDS);
+
+            if (is_connection_closed()) {
+                cancel_tasks(id_tasks);
+                return;
+            }
+
+            if (result == nullptr) {
+                continue; // retry
+            }

            if (result->is_error()) {
                error_handler(result->to_json());
@ -3633,6 +3681,7 @@ int main(int argc, char ** argv) {
    const auto handle_completions_impl = [&ctx_server, &res_error, &res_ok](
            server_task_type type,
            json & data,
+            std::function<bool()> is_connection_closed,
            httplib::Response & res,
            oaicompat_type oaicompat) {
        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
@ -3694,7 +3743,7 @@ int main(int argc, char ** argv) {
                }
            }, [&](const json & error_data) {
                res_error(res, error_data);
-            });
+            }, is_connection_closed);

            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        } else {
@ -3704,6 +3753,7 @@ int main(int argc, char ** argv) {
                    if (res_json.is_array()) {
                        for (const auto & res : res_json) {
                            if (!server_sent_event(sink, "data", res)) {
+                                // sending failed (HTTP connection closed), cancel the generation
                                return false;
                            }
                        }
@ -3713,6 +3763,9 @@ int main(int argc, char ** argv) {
                    }
                }, [&](const json & error_data) {
                    server_sent_event(sink, "error", error_data);
+                }, [&sink]() {
+                    // note: do not use req.is_connection_closed here because req is already destroyed
+                    return !sink.is_writable();
                });
                if (oaicompat != OAICOMPAT_TYPE_NONE) {
                    static const std::string ev_done = "data: [DONE]\n\n";
@ -3735,6 +3788,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_NONE);
    };
@ -3744,6 +3798,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_COMPLETION);
    };
@ -3820,6 +3875,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_INFILL,
            data,
+            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
    };
@ -3834,6 +3890,7 @@ int main(int argc, char ** argv) {
        return handle_completions_impl(
            SERVER_TASK_TYPE_COMPLETION,
            data,
+            req.is_connection_closed,
            res,
            OAICOMPAT_TYPE_CHAT);
    };
@ -3980,7 +4037,7 @@ int main(int argc, char ** argv) {
            }, [&](const json & error_data) {
                res_error(res, error_data);
                error = true;
-            });
+            }, req.is_connection_closed);

            ctx_server.queue_results.remove_waiting_task_ids(task_ids);
        }
@ -4070,7 +4127,7 @@ int main(int argc, char ** argv) {
            }, [&](const json & error_data) {
                res_error(res, error_data);
                error = true;
-            });
+            }, req.is_connection_closed);
        }

        if (error) {
--- a/examples/server/tests/unit/test_completion.py
+++ b/examples/server/tests/unit/test_completion.py
@ -1,4 +1,5 @@
 import pytest
+import requests
 import time
 from openai import OpenAI
 from utils import *
@ -405,3 +406,23 @@ def test_n_probs_post_sampling():
            assert "bytes" in prob and type(prob["bytes"]) == list
        # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs
        assert any(prob["prob"] == 1.0 for prob in tok["top_probs"])
+
+
+def test_cancel_request():
+    global server
+    server.n_ctx = 4096
+    server.n_predict = -1
+    server.n_slots = 1
+    server.server_slots = True
+    server.start()
+    # send a request that will take a long time, but cancel it before it finishes
+    try:
+        server.make_request("POST", "/completion", data={
+            "prompt": "I believe the meaning of life is",
+        }, timeout=0.1)
+    except requests.exceptions.ReadTimeout:
+        pass # expected
+    # make sure the slot is free
+    time.sleep(1) # wait for HTTP_POLLING_SECONDS
+    res = server.make_request("GET", "/slots")
+    assert res.body[0]["is_processing"] == False
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@ -26,6 +26,9 @@ from re import RegexFlag
 import wget


+DEFAULT_HTTP_TIMEOUT = 10 if "LLAMA_SANITIZE" not in os.environ else 30
+
+
 class ServerResponse:
    headers: dict
    status_code: int
@ -88,7 +91,7 @@ class ServerProcess:
        if "PORT" in os.environ:
            self.server_port = int(os.environ["PORT"])

-    def start(self, timeout_seconds: int = 10) -> None:
+    def start(self, timeout_seconds: int | None = DEFAULT_HTTP_TIMEOUT) -> None:
        if "LLAMA_SERVER_BIN_PATH" in os.environ:
            server_path = os.environ["LLAMA_SERVER_BIN_PATH"]
        elif os.name == "nt":
@ -219,17 +222,18 @@ class ServerProcess:
        path: str,
        data: dict | Any | None = None,
        headers: dict | None = None,
+        timeout: float | None = None,
    ) -> ServerResponse:
        url = f"http://{self.server_host}:{self.server_port}{path}"
        parse_body = False
        if method == "GET":
-            response = requests.get(url, headers=headers)
+            response = requests.get(url, headers=headers, timeout=timeout)
            parse_body = True
        elif method == "POST":
-            response = requests.post(url, headers=headers, json=data)
+            response = requests.post(url, headers=headers, json=data, timeout=timeout)
            parse_body = True
        elif method == "OPTIONS":
-            response = requests.options(url, headers=headers)
+            response = requests.options(url, headers=headers, timeout=timeout)
        else:
            raise ValueError(f"Unimplemented method: {method}")
        result = ServerResponse()
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@ -95,11 +95,11 @@ int main(int argc, char ** argv) {
    llama_sampler_chain_add(smpl, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    // helper function to evaluate a prompt and generate a response
-    auto generate = [&](const std::string & prompt) {
+    auto generate = [&](const std::string & prompt, bool is_first) {
        std::string response;

        // tokenize the prompt
-        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
        std::vector<llama_token> prompt_tokens(n_prompt_tokens);
        if (llama_tokenize(vocab, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), llama_get_kv_cache_used_cells(ctx) == 0, true) < 0) {
            GGML_ABORT("failed to tokenize the prompt\n");
@ -180,7 +180,7 @@ int main(int argc, char ** argv) {

        // generate a response
        printf("\033[33m");
-        std::string response = generate(prompt);
+        std::string response = generate(prompt, prev_len == 0);
        printf("\n\033[0m");

        // add the response to the messages
--- a/examples/tts/tts.cpp
+++ b/examples/tts/tts.cpp
@ -425,6 +425,33 @@ static void prompt_init(llama_tokens & prompt, const llama_vocab * vocab) {
    prompt_add(prompt, vocab, "<|im_start|>\n", true, true);
 }

+static std::vector<llama_token> prepare_guide_tokens(const llama_vocab * vocab, const std::string & str) {
+    const std::string& delimiter = "<|text_sep|>";
+
+    std::vector<llama_token> result;
+    size_t start = 0;
+    size_t end = str.find(delimiter);
+
+    //first token is always a newline, as it was not previously added
+    result.push_back(common_tokenize(vocab, "\n", false, true)[0]);
+
+    while (end != std::string::npos) {
+        std::string current_word = str.substr(start, end - start);
+        auto tmp = common_tokenize(vocab, current_word, false, true);
+        result.push_back(tmp[0]);
+        start = end + delimiter.length();
+        end = str.find(delimiter, start);
+    }
+
+    // Add the last part
+    std::string current_word = str.substr(start);
+    auto tmp = common_tokenize(vocab, current_word, false, true);
+    if (tmp.size() > 0) {
+        result.push_back(tmp[0]);
+    }
+    return result;
+}
+
 int main(int argc, char ** argv) {
    common_params params;

@ -494,6 +521,7 @@ int main(int argc, char ** argv) {
    const auto t_main_start = ggml_time_us();

    std::vector<llama_token> codes;
+    std::vector<llama_token> guide_tokens;

    // process prompt and generate voice codes
    {
@ -508,6 +536,9 @@ int main(int argc, char ** argv) {
        // convert the input text into the necessary format expected by OuteTTS
        {
            std::string prompt_clean = process_text(params.prompt);
+            if (params.vocoder.use_guide_tokens) {
+                guide_tokens = prepare_guide_tokens(vocab, prompt_clean);
+            }

            LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str());

@ -717,6 +748,8 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
        int n_past   = batch.n_tokens;
        int n_decode = 0;

+        bool next_token_uses_guide_token = true;
+
        while (n_decode <= n_predict) {
            // prepare the next batch
            common_batch_clear(batch);
@ -728,7 +761,17 @@ lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|14
                    continue;
                }

-                const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+                llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]);
+
+                //guide tokens help prevent hallucinations by forcing the TTS to use the correct word
+                if (!guide_tokens.empty() && next_token_uses_guide_token && !llama_vocab_is_control(vocab, new_token_id) && !llama_vocab_is_eog(vocab, new_token_id)) {
+                    llama_token guide_token = guide_tokens[0];
+                    guide_tokens.erase(guide_tokens.begin());
+                    new_token_id = guide_token; //ensure correct word fragment is used
+                }
+
+                //this is the token id that always precedes a new word
+                next_token_uses_guide_token = (new_token_id == 198);

                common_sampler_accept(smpl[i], new_token_id, true);

--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@ -333,8 +333,12 @@ struct ggml_backend_sycl_context {
    // pool
    std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];

+    std::unique_ptr<ggml_sycl_pool> host_pools[GGML_SYCL_MAX_DEVICES];
+
    static std::unique_ptr<ggml_sycl_pool> new_pool_for_device(queue_ptr qptr, int device);

+    static std::unique_ptr<ggml_sycl_pool> new_pool_for_host(queue_ptr qptr, int device);
+
    ggml_sycl_pool & pool(int device) {
        if (pools[device] == nullptr) {
            pools[device] = new_pool_for_device(stream(device,0), device);
@ -345,6 +349,15 @@ struct ggml_backend_sycl_context {
    ggml_sycl_pool & pool() {
        return pool(device);
    }
+
+    ggml_sycl_pool & host_pool(int device) {
+        if (host_pools[device] == nullptr) {
+            host_pools[device] = new_pool_for_host(stream(device, 0), device);
+        }
+        return *host_pools[device];
+    }
+
+    ggml_sycl_pool & host_pool() { return host_pool(device); }
 };

 // common device functions
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@ -82,6 +82,14 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
    return device_type.str();
 }

+template <typename Ts> struct matrix_info_t {
+    oneapi::mkl::transpose transpose_info[2];
+    Ts                     value_info[2];
+    std::int64_t           size_info[3];
+    std::int64_t           ld_info[3];
+    std::int64_t           groupsize_info;
+};
+
 namespace dpct
 {
    typedef sycl::queue *queue_ptr;
@ -1727,26 +1735,13 @@ namespace dpct
        };

        template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
-                                    const void *alpha, const void **a, int lda,
-                                    const void **b, int ldb, const void *beta, void **c,
-                                    int ldc, int batch_size)
-        {
-            struct matrix_info_t
-            {
-                oneapi::mkl::transpose transpose_info[2];
-                Ts value_info[2];
-                std::int64_t size_info[3];
-                std::int64_t ld_info[3];
-                std::int64_t groupsize_info;
-            };
-
+        inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
+                                    int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
+                                    int ldb, const void * beta, void ** c, int ldc, int batch_size,
+                                    matrix_info_t<float> * matrix_info) {
            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);

-            matrix_info_t *matrix_info =
-                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
            matrix_info->transpose_info[0] = a_trans;
            matrix_info->transpose_info[1] = b_trans;
            matrix_info->value_info[0] = alpha_value;
@ -1763,23 +1758,18 @@ namespace dpct
            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
                oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
                matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
-                matrix_info->size_info + 2, matrix_info->value_info, reinterpret_cast<const Ta **>(a),
-                matrix_info->ld_info, reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1,
-                &(matrix_info->groupsize_info));
+                matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
+                matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
+                reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
 #else
            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
                q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
-                matrix_info->size_info + 1, matrix_info->size_info + 2, matrix_info->value_info,
+                matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
                reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
-                matrix_info->ld_info + 1, matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
-                matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+                matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
+                reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
 #endif
-
-            q.submit([&](sycl::handler &cgh)
-                     {
-    cgh.depends_on(e);
-    cgh.host_task([=] { std::free(matrix_info); }); });
        }

        template <class Ta, class Tb, class Tc, class Ts>
@ -2422,25 +2412,11 @@ namespace dpct
    /// \param [in] ldc Leading dimension of C.
    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
    /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a[],
-                           library_data_t a_type, int lda, const void *b[],
-                           library_data_t b_type, int ldb, const void *beta,
-                           void *c[], library_data_t c_type, int ldc,
-                           int batch_size, library_data_t scaling_type)
-    {
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
+    inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m,
+                           int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda,
+                           const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[],
+                           library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type,
+                           matrix_info_t<float> * matrix_info) {
        std::uint64_t key =
            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
        switch (key)
@ -2449,48 +2425,24 @@ namespace dpct
            library_data_t::real_float, library_data_t::real_float,
            library_data_t::real_float, library_data_t::real_float):
        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
+            detail::gemm_batch_impl<float, float, float, float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
+                                                                beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_double, library_data_t::real_double,
            library_data_t::real_double, library_data_t::real_double):
        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
+            detail::gemm_batch_impl<double, double, double, double>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb,
+                                                                    beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_half, library_data_t::real_half,
            library_data_t::real_half, library_data_t::real_half):
        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, b, ldb, beta, c, ldc,
-                                                batch_size);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
 #ifdef __INTEL_MKL__
@ -2498,19 +2450,16 @@ namespace dpct
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_bfloat16, library_data_t::real_float):
        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
            library_data_t::real_float, library_data_t::real_float):
        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           b, ldb, beta, c, ldc, batch_size);
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
 #endif
@ -2522,10 +2471,9 @@ namespace dpct
                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
            float beta_float =
                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                           a, lda, b, ldb, &beta_float, c, ldc,
-                                           batch_size);
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc, batch_size,
+                matrix_info);
            break;
        }
        case detail::get_type_combination_id(
@ -2533,8 +2481,7 @@ namespace dpct
            library_data_t::real_float, library_data_t::real_float):
        {
            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
@ -2542,8 +2489,7 @@ namespace dpct
            library_data_t::real_float, library_data_t::real_float):
        {
            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info);
            break;
        }
        case detail::get_type_combination_id(
@ -2557,8 +2503,7 @@ namespace dpct
            sycl::half alpha_half(alpha_value);
            sycl::half beta_half(beta_value);
            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-                batch_size);
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc, batch_size, matrix_info);
            break;
        }
        default:
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@ -1173,6 +1173,85 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
    }
 };

+struct ggml_sycl_pool_host : public ggml_sycl_pool {
+    queue_ptr qptr;
+    int       device;
+
+    inline static int counter{ 0 };
+
+    struct ggml_sycl_buffer {
+        void * ptr  = nullptr;
+        size_t size = 0;
+    };
+
+    // Set arbitrarly to 64
+    static constexpr int          MAX_POOL_SIZE{ 64 };
+    std::vector<ggml_sycl_buffer> buffer_pool = std::vector<ggml_sycl_buffer>(MAX_POOL_SIZE);
+    size_t                        pool_size   = 0;
+
+    explicit ggml_sycl_pool_host(queue_ptr qptr_, int device_) : qptr(qptr_), device(device_) {}
+
+    ~ggml_sycl_pool_host() {
+        for (int i = 0; i < MAX_POOL_SIZE; ++i) {
+            ggml_sycl_buffer & b = buffer_pool[i];
+            if (b.ptr != nullptr) {
+                SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
+                b.ptr = nullptr;
+                pool_size -= b.size;
+                b.size = 0;
+            }
+        }
+        counter = 0;
+    }
+
+    void * alloc(size_t size, size_t * actual_size) override {
+        if (counter == MAX_POOL_SIZE) {
+            ggml_sycl_buffer b               = buffer_pool[0];
+            void *           ptr             = b.ptr;
+            *actual_size                     = b.size;
+            counter                          = 1;
+            return ptr;
+        }
+        ggml_sycl_buffer & b = buffer_pool[counter];
+
+        if (b.ptr == nullptr) {
+            void * ptr;
+
+            SYCL_CHECK(CHECK_TRY_ERROR(ptr = (void *) sycl::malloc_host(size, *qptr)));
+            if (!ptr) {
+                GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size);
+                return nullptr;
+            }
+            pool_size += size;
+            *actual_size = size;
+            counter      = counter + 1;
+            return ptr;
+        } else {
+            ++counter;
+            b.size = size;
+            return b.ptr;
+        }
+    }
+
+    void free(void * ptr, size_t size) override {
+        // if the pool is not completed add the pointer to it in place of the first nullptr found.
+        // Otherwise do nothing, pointers will be freed once the pool is deallocated.
+        for (int i = 0; i < MAX_POOL_SIZE; ++i) {
+            ggml_sycl_buffer & b = buffer_pool[i];
+            if (b.ptr == nullptr) {
+                b.ptr  = ptr;
+                b.size = size;
+                return;
+            }
+        }
+    }
+};
+
+std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_host(queue_ptr qptr, int device) {
+    // return pool for the host to speed up memory management
+    return std::unique_ptr<ggml_sycl_pool>(new ggml_sycl_pool_host(qptr, device));
+}
+
 std::unique_ptr<ggml_sycl_pool> ggml_backend_sycl_context::new_pool_for_device(queue_ptr qptr, int device) {
    // TBD: NO VMM support
    // if (ggml_sycl_info().devices[device].vmm) {
@ -3363,6 +3442,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,

        ggml_sycl_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
        ggml_sycl_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
+        ggml_sycl_pool_alloc<matrix_info_t<float>> matrix_info(ctx.host_pool(), 1);

        sycl::range<3> block_dims(1, ne12, ne13);
        /*
@ -3391,14 +3471,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
            });
        }
        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-            *main_stream, oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
-            (const void **)(ptrs_src.get() + 0 * ne23),
-            dpct::library_data_t::real_half, nb01 / nb00,
-            (const void **)(ptrs_src.get() + 1 * ne23),
-            dpct::library_data_t::real_half, nb11 / nb10, beta,
-            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
-            cu_compute_type)));
+            *main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00,
+            (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta,
+            (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get())));
    }
 }
 catch (sycl::exception const &exc) {
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@ -386,10 +386,13 @@ struct vk_flash_attn_push_constants {
    uint32_t nev3;
    uint32_t nem1;

+    uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;
+    uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
+    uint32_t nb21;
    uint32_t nb22;
    uint32_t nb23;
    uint32_t nb31;
@ -4809,7 +4812,14 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    }
    assert(pipelines);

-    bool aligned = (KV % pipelines[1]->align) == 0;
+    const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
+    const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
+    const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));
+
+    bool aligned = (KV % pipelines[1]->align) == 0 &&
+                   // the "aligned" shader variant will forcibly align strides, for performance
+                   (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;
+
    vk_pipeline pipeline = pipelines[aligned];
    assert(pipeline);

@ -4845,15 +4855,15 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx

    if (ctx->device->uma) {
        ggml_vk_host_get(ctx->device, q->data, d_Q, q_buf_offset);
-        ggml_vk_host_get(ctx->device, k->data, d_K, q_buf_offset);
-        ggml_vk_host_get(ctx->device, v->data, d_V, q_buf_offset);
-        ggml_vk_host_get(ctx->device, dst->data, d_D, q_buf_offset);
+        ggml_vk_host_get(ctx->device, k->data, d_K, k_buf_offset);
+        ggml_vk_host_get(ctx->device, v->data, d_V, v_buf_offset);
+        ggml_vk_host_get(ctx->device, dst->data, d_D, d_buf_offset);
        Q_uma = d_Q != nullptr;
        K_uma = d_K != nullptr;
        V_uma = d_V != nullptr;
        D_uma = d_D != nullptr;
        if (mask) {
-            ggml_vk_host_get(ctx->device, mask->data, d_M, q_buf_offset);
+            ggml_vk_host_get(ctx->device, mask->data, d_M, m_buf_offset);
            M_uma = d_M != nullptr;
        }
    }
@ -4891,7 +4901,18 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        }
    }

-    const vk_flash_attn_push_constants pc = { N, KV, (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3, (uint32_t)neq2, (uint32_t)neq3, (uint32_t)nek2, (uint32_t)nek3, (uint32_t)nev2, (uint32_t)nev3, nem1, (uint32_t)nbq2, (uint32_t)nbq3, (uint32_t)nbk2, (uint32_t)nbk3, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, mask != nullptr, n_head_log2, m0, m1 };
+    const vk_flash_attn_push_constants pc = { N, KV,
+                                              (uint32_t)ne1, (uint32_t)ne2, (uint32_t)ne3,
+                                              (uint32_t)neq2, (uint32_t)neq3,
+                                              (uint32_t)nek2, (uint32_t)nek3,
+                                              (uint32_t)nev2, (uint32_t)nev3,
+                                              nem1,
+                                              q_stride, (uint32_t)nbq2, (uint32_t)nbq3,
+                                              k_stride, (uint32_t)nbk2, (uint32_t)nbk3,
+                                              v_stride, (uint32_t)nbv2, (uint32_t)nbv3,
+                                              nbm1,
+                                              scale, max_bias, logit_softcap,
+                                              mask != nullptr, n_head_log2, m0, m1 };
    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
                                {
                                    vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE},
@ -8668,6 +8689,7 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
    ggml_tensor * src0 = tensor->src[0];
    ggml_tensor * src1 = tensor->src[1];
    ggml_tensor * src2 = tensor->src[2];
+    ggml_tensor * src3 = tensor->src[3];

    void * tensor_data = tensor->data;

@ -8730,6 +8752,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
                        if (src2 != nullptr) {
                            std::cerr << "src2=" << src2 << " src2->name=" << src2->name << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
                        }
+                        if (src3 != nullptr) {
+                            std::cerr << "src3=" << src3 << " src3->name=" << src3->name << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+                        }
                        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
                        std::cerr << std::endl << "Result:" << std::endl;
                        ggml_vk_print_tensor_area(tensor, tensor_data, i0, i1, i2, i3);
@ -8774,6 +8799,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
+        if (src3 != nullptr) {
+            std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+        }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result:" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, 5, 5, 0, 0);
@ -8796,6 +8824,9 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) {
        if (src2 != nullptr) {
            std::cerr << "src2=" << src2 << " op=" << ggml_op_name(src2->op) << " type=" << ggml_type_name(src2->type) << " ne0=" << src2->ne[0] << " nb0=" << src2->nb[0] << " ne1=" << src2->ne[1] << " nb1=" << src2->nb[1] << " ne2=" << src2->ne[2] << " nb2=" << src2->nb[2] << " ne3=" << src2->ne[3] << " nb3=" << src2->nb[3] << " offset=" << src2->view_offs << std::endl;
        }
+        if (src3 != nullptr) {
+            std::cerr << "src3=" << src3 << " op=" << ggml_op_name(src3->op) << " type=" << ggml_type_name(src3->type) << " ne0=" << src3->ne[0] << " nb0=" << src3->nb[0] << " ne1=" << src3->ne[1] << " nb1=" << src3->nb[1] << " ne2=" << src3->ne[2] << " nb2=" << src3->nb[2] << " ne3=" << src3->ne[3] << " nb3=" << src3->nb[3] << " offset=" << src3->view_offs << std::endl;
+        }
        std::cerr << "First error: result=" << first_error_result << " correct=" << first_error_correct  << " i3=" << first_error[3] << " i2=" << first_error[2] << " i1=" << first_error[1] << " i0=" << first_error[0] << std::endl;
        std::cerr << std::endl << "Result:" << std::endl;
        ggml_vk_print_tensor_area(tensor, tensor_data, first_error[0], first_error[1], first_error[2], first_error[3]);
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@ -42,10 +42,13 @@ layout (push_constant) uniform parameter {
    uint32_t nev3;
    uint32_t nem1;

+    uint32_t nb01;
    uint32_t nb02;
    uint32_t nb03;
+    uint32_t nb11;
    uint32_t nb12;
    uint32_t nb13;
+    uint32_t nb21;
    uint32_t nb22;
    uint32_t nb23;
    uint32_t nb31;
@ -146,6 +149,23 @@ void main() {
    tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D);
    tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D);

+    // nb?1 are already divided by the type size and are in units of elements
+    uint32_t q_stride = p.nb01;
+    uint32_t k_stride = p.nb11;
+    uint32_t v_stride = p.nb21;
+    // hint to the compiler that strides are aligned for the aligned variant of the shader
+    if (Clamp != gl_CooperativeMatrixClampModeConstantNV)
+    {
+        q_stride &= ~7;
+#if !defined(BLOCK_SIZE)
+        k_stride &= ~7;
+        v_stride &= ~7;
+#endif
+    }
+    tensorLayoutQ = setTensorLayoutStrideNV(tensorLayoutQ, q_stride, 1);
+    tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1);
+    tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1);
+
    coopmat<Q_TYPE, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Q;
    coopmat<float16_t, gl_ScopeWorkgroup, Br, D, gl_MatrixUseA> Qf16;

--- a/ggml/src/gguf.cpp
+++ b/ggml/src/gguf.cpp
@ -648,6 +648,10 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par

            ok = ok && data != nullptr;

+            if (ok) {
+                ggml_set_name(data, "GGUF tensor data binary blob");
+            }
+
            // read the binary blob with the tensor data
            ok = ok && gr.read(data->data, ctx->size);

--- a/scripts/hf.sh
+++ b/scripts/hf.sh
@ -0,0 +1,112 @@
+#!/bin/bash
+#
+# Shortcut for downloading HF models
+#
+# Usage:
+#   ./llama-cli -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#   ./llama-cli -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf)
+#
+
+# all logs go to stderr
+function log {
+    echo "$@" 1>&2
+}
+
+function usage {
+    log "Usage: $0 [[--url] <url>] [--repo <repo>] [--file <file>] [--outdir <dir> [-h|--help]"
+    exit 1
+}
+
+# check for curl or wget
+function has_cmd {
+    if ! [ -x "$(command -v $1)" ]; then
+        return 1
+    fi
+}
+
+if has_cmd wget; then
+    cmd="wget -q -c -O %s/%s %s"
+elif has_cmd curl; then
+    cmd="curl -C - -f --output-dir %s -o %s -L %s"
+else
+    log "[E] curl or wget not found"
+    exit 1
+fi
+
+url=""
+repo=""
+file=""
+outdir="."
+
+# parse args
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --url)
+            url="$2"
+            shift 2
+            ;;
+        --repo)
+            repo="$2"
+            shift 2
+            ;;
+        --file)
+            file="$2"
+            shift 2
+            ;;
+        --outdir)
+            outdir="$2"
+            shift 2
+            ;;
+        -h|--help)
+            usage
+            ;;
+        *)
+            url="$1"
+            shift
+            ;;
+    esac
+done
+
+if [ -n "$repo" ] && [ -n "$file" ]; then
+    url="https://huggingface.co/$repo/resolve/main/$file"
+fi
+
+if [ -z "$url" ]; then
+    log "[E] missing --url"
+    usage
+fi
+
+# check if the URL is a HuggingFace model, and if so, try to download it
+is_url=false
+
+if [[ ${#url} -gt 22 ]]; then
+    if [[ ${url:0:22} == "https://huggingface.co" ]]; then
+        is_url=true
+    fi
+fi
+
+if [ "$is_url" = false ]; then
+    log "[E] invalid URL, must start with https://huggingface.co"
+    exit 0
+fi
+
+# replace "blob/main" with "resolve/main"
+url=${url/blob\/main/resolve\/main}
+
+basename=$(basename $url)
+
+log "[+] attempting to download $basename"
+
+if [ -n "$cmd" ]; then
+    cmd=$(printf "$cmd" "$outdir" "$basename" "$url")
+    log "[+] $cmd"
+    if $cmd; then
+        echo $outdir/$basename
+        exit 0
+    fi
+fi
+
+log "[-] failed to download"
+
+exit 1
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@ -7,18 +7,17 @@

 #include <algorithm>
 #include <cassert>
+#include <codecvt>
 #include <cstddef>
 #include <cstdint>
+#include <locale>
 #include <map>
 #include <regex>
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
-#include <unordered_set>
 #include <utility>
 #include <vector>
-#include <locale>
-#include <codecvt>

 size_t unicode_len_utf8(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,3 +1,5 @@
+llama_add_compile_flags()
+
 function(llama_test target)
    include(CMakeParseArguments)
    set(options)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -3046,9 +3046,10 @@ struct test_flash_attn_ext : public test_case {
    const float logit_softcap; // Gemma 2

    const ggml_type type_KV;
+    std::array<int32_t, 4> permute;

    std::string vars() override {
-        return VARS_TO_STR8(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV);
+        return VARS_TO_STR9(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV, permute);
    }

    double max_nmse_err() override {
@ -3063,19 +3064,33 @@ struct test_flash_attn_ext : public test_case {
    }

    test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8,
-                        bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16)
-        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV) {}
+                        bool mask = true, float max_bias = 0.0f, float logit_softcap = 0.0f, ggml_type type_KV = GGML_TYPE_F16,
+                        std::array<int32_t, 4> permute = {0, 1, 2, 3})
+        : hs(hs), nh(nh), kv(kv), nb(nb), mask(mask), max_bias(max_bias), logit_softcap(logit_softcap), type_KV(type_KV), permute(permute) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        const int64_t hs_padded = GGML_PAD(hs, ggml_blck_size(type_KV));

-        ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_padded, nb, nh, 1);
+        auto const &create_permuted = [&](ggml_type type, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) -> ggml_tensor * {
+            int64_t ne[4] = {ne0, ne1, ne2, ne3};
+            int64_t ne_perm[4];
+            for (int i = 0; i < 4; ++i) {
+                ne_perm[permute[i]] = ne[i];
+            }
+            ggml_tensor * t = ggml_new_tensor_4d(ctx, type, ne_perm[0], ne_perm[1], ne_perm[2], ne_perm[3]);
+            if (permute != std::array<int32_t, 4>{0, 1, 2, 3}) {
+                t = ggml_permute(ctx, t, permute[0], permute[1], permute[2], permute[3]);
+            }
+            return t;
+        };
+
+        ggml_tensor * q = create_permuted(GGML_TYPE_F32, hs_padded, nb, nh, 1);
        ggml_set_name(q, "q");

-        ggml_tensor * k = ggml_new_tensor_4d(ctx, type_KV,       hs_padded, kv, nh, 1);
+        ggml_tensor * k = create_permuted(type_KV,       hs_padded, kv, nh, 1);
        ggml_set_name(k, "k");

-        ggml_tensor * v = ggml_new_tensor_4d(ctx, type_KV,       hs_padded, kv, nh, 1);
+        ggml_tensor * v = create_permuted(type_KV,       hs_padded, kv, nh, 1);
        ggml_set_name(v, "v");

        ggml_tensor * m = nullptr;
@ -4167,6 +4182,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
                            for (int nb : { 1, 3, 32, 35, }) {
                                for (ggml_type type_KV : {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0}) {
                                    test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV));
+                                    // run fewer test cases permuted
+                                    if (mask == true && max_bias == 0.0f && logit_softcap == 0 && kv == 512) {
+                                        test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, mask, max_bias, logit_softcap, type_KV, {0, 2, 1, 3}));
+                                    }
                                }
                            }
                        }
--- a/tests/test-gguf.cpp
+++ b/tests/test-gguf.cpp
@ -48,7 +48,7 @@ enum handcrafted_file_type {
    HANDCRAFTED_DATA_CUSTOM_ALIGN          = 810 + offset_has_data,
 };

-std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
+static std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) {
    switch (hft) {
        case HANDCRAFTED_HEADER_BAD_MAGIC:           return "HEADER_BAD_MAGIC";
        case HANDCRAFTED_HEADER_BAD_VERSION_1:       return "HEADER_BAD_VERSION_1";
@ -99,7 +99,7 @@ static bool expect_context_not_null(const enum handcrafted_file_type hft) {

 typedef std::pair<enum ggml_type, std::array<int64_t, GGML_MAX_DIMS>> tensor_config_t;

-std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
+static std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
    std::vector<tensor_config_t> tensor_configs;
    tensor_configs.reserve(100);

@ -122,7 +122,7 @@ std::vector<tensor_config_t> get_tensor_configs(std::mt19937 & rng) {
    return tensor_configs;
 }

-std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
+static std::vector<std::pair<enum gguf_type, enum gguf_type>> get_kv_types(std::mt19937 rng) {
    std::vector<std::pair<enum gguf_type, enum gguf_type>> kv_types;
    kv_types.reserve(100);

@ -626,8 +626,6 @@ static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const u

    bool ok = true;

-    const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT;
-
    for (int i = 0; i < int(tensor_configs.size()); ++i) {
        const ggml_type                          type  = tensor_configs[i].first;
        const std::array<int64_t, GGML_MAX_DIMS> shape = tensor_configs[i].second;
@ -866,13 +864,13 @@ static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t
                    case GGUF_TYPE_COUNT:
                    default: {
                        GGML_ABORT("fatal error");
-                    } break;
+                    }
                }
            } break;
            case GGUF_TYPE_COUNT:
            default: {
                GGML_ABORT("fatal error");
-            } break;
+            }
        }
    }

@ -938,7 +936,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
        }

        if (type == GGUF_TYPE_ARRAY) {
-            const int arr_n = gguf_get_arr_n(ctx, id);
+            const size_t arr_n = gguf_get_arr_n(ctx, id);
            if (arr_n != gguf_get_arr_n(other, idx_other)) {
                ok = false;
                continue;
@ -953,7 +951,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
            if (type_arr == GGUF_TYPE_BOOL) {
                const int8_t * data       = reinterpret_cast<const int8_t *>(gguf_get_arr_data(ctx,   id));
                const int8_t * data_other = reinterpret_cast<const int8_t *>(gguf_get_arr_data(other, idx_other));
-                for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
+                for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
                    if (bool(data[arr_i]) != bool(data_other[arr_i])) {
                        ok = false;
                    }
@ -962,7 +960,7 @@ static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other
            }

            if (type_arr == GGUF_TYPE_STRING) {
-                for (int arr_i = 0; arr_i < arr_n; ++arr_i) {
+                for (size_t arr_i = 0; arr_i < arr_n; ++arr_i) {
                    const std::string str       = gguf_get_arr_str(ctx,   id,       arr_i);
                    const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i);
                    if (str != str_other) {
@ -1033,6 +1031,12 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml

    struct ggml_tensor * t_orig = ggml_get_first_tensor(orig);
    struct ggml_tensor * t_read = ggml_get_first_tensor(read);
+
+    if (std::string(t_read->name) != "GGUF tensor data binary blob") {
+        return false;
+    }
+    t_read = ggml_get_next_tensor(read, t_read);
+
    while (t_orig) {
        if (!t_read) {
            ok = false;
@ -1051,13 +1055,13 @@ static bool same_tensor_data(const struct ggml_context * orig, const struct ggml
        }

        t_orig = ggml_get_next_tensor(orig, t_orig);
-        t_read = ggml_get_next_tensor(orig, t_read);
+        t_read = ggml_get_next_tensor(read, t_read);
    }
    if (t_read) {
        ok = false;
    }

-    return true;
+    return ok;
 }

 static std::pair<int, int> test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) {
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -144,7 +144,6 @@ static void test_penalties(

    sampler_tester tester(probs, probs_expected);

-    const size_t n_vocab = probs.size();
    auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);

    for (size_t i = 0; i < last_tokens.size(); i++) {