mirror of https://github.com/ggerganov/llama.cpp.git (synced 2025-01-12 13:27:21 +01:00)

Merge branch 'master' of https://github.com/piDack/llama.cpp into support_glm_edge_model

This commit is contained in: ae41d3efed
.clang-format (new file, 161 lines)
@@ -0,0 +1,161 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
  Kind: Always
  OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '^<.*\.h>'
    Priority: 1
    SortPriority: 0
  - Regex: '^<.*'
    Priority: 2
    SortPriority: 0
  - Regex: '.*'
    Priority: 3
    SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...
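For illustration only (this snippet is not part of the commit), a small hand-written C++ fragment laid out the way the settings above ask for it: 4-space indentation, braces inserted on single-statement ifs, pointer and reference alignment in the middle, and consecutive declarations and assignments aligned.

    // Hypothetical example, formatted by hand to mirror the .clang-format settings above.
    #include <cstdio>
    #include <cstdint>
    #include <string>

    struct llm_settings {
        uint32_t n_ctx    = 4096;  // context size
        uint32_t n_batch  = 512;   // logical batch size
        bool     use_mmap = true;  // map the model file instead of reading it
    };

    static std::string describe(const llm_settings & s) {
        if (s.use_mmap) {
            return "mmap, ctx=" + std::to_string(s.n_ctx);
        }
        return "no mmap, ctx=" + std::to_string(s.n_ctx);
    }

    int main() {
        llm_settings s;
        std::printf("%s\n", describe(s).c_str());
        return 0;
    }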
@@ -26,7 +26,7 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .

@@ -19,7 +19,7 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
     cmake --build build --config Release -j$(nproc) && \
     cp build/bin/* .

@@ -1,6 +1,6 @@
 ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8

-FROM cosdt/cann:$ASCEND_VERSION AS build
+FROM ascendai/cann:$ASCEND_VERSION AS build

 WORKDIR /app

@@ -22,11 +22,11 @@ ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

 RUN echo "Building with static libs" && \
     source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
-    cmake -B build -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli

 # TODO: use image with NNRT
-FROM cosdt/cann:$ASCEND_VERSION AS runtime
+FROM ascendai/cann:$ASCEND_VERSION AS runtime
 COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENV LC_ALL=C.utf8

@@ -22,16 +22,17 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
-COPY --from=build /app/build/bin/llama-cli /llama-cli
+COPY --from=build /app/lib/ /
+COPY --from=build /app/build/bin/llama-cli /

 ENTRYPOINT [ "/llama-cli" ]

@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
     export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
     echo "Building with static libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
     ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \
     cmake --build build --config Release --target llama-cli

@@ -15,16 +15,17 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-cli -j$(nproc)
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENTRYPOINT [ "/llama-cli" ]

@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \
     cmake --build build --config Release --target llama-cli

 # Clean up

@@ -22,16 +22,17 @@ COPY . .
 RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
     export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
     fi && \
-    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine

@@ -1,4 +1,4 @@
-ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
+ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

@@ -15,7 +15,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
     export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
     fi && \
     echo "Building with dynamic libs" && \
-    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
+    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build build --config Release --target llama-server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

@@ -15,16 +15,17 @@ WORKDIR /app

 COPY . .

-RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
-    cmake --build build --config Release --target llama-server -j$(nproc)
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc) && \
+    mkdir -p /app/lib && \
+    find build -name "*.so" -exec cp {} /app/lib \;

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
-COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/lib/ /
 COPY --from=build /app/build/bin/llama-server /llama-server

 # Must be set to 0.0.0.0 so it can listen to requests from host machine

@@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
 # Build it
 WORKDIR /app
 COPY . .
-RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
+RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
     cmake --build build --config Release --target llama-server

 # Clean up

@@ -126,9 +126,9 @@ effectiveStdenv.mkDerivation (finalAttrs: {
   };

   postPatch = ''
-    substituteInPlace ./ggml/src/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
       --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-    substituteInPlace ./ggml/src/ggml-metal.m \
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
   '';

@@ -173,7 +173,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     (cmakeBool "GGML_NATIVE" false)
     (cmakeBool "GGML_BLAS" useBlas)
     (cmakeBool "GGML_CUDA" useCuda)
-    (cmakeBool "GGML_HIPBLAS" useRocm)
+    (cmakeBool "GGML_HIP" useRocm)
     (cmakeBool "GGML_METAL" useMetalKit)
     (cmakeBool "GGML_VULKAN" useVulkan)
     (cmakeBool "GGML_STATIC" enableStatic)
.github/ISSUE_TEMPLATE/01-bug-low.yml (vendored, deleted, 50 lines)
@@ -1,50 +0,0 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/010-bug-compilation.yml (vendored, new file, 77 lines)
@@ -0,0 +1,77 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $git rev-parse HEAD
        84a07a17b1b08cf2b9747c633a2372782848a27f
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
.github/ISSUE_TEMPLATE/011-bug-results.yml (vendored, new file, 101 lines)
@@ -0,0 +1,101 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the model evaluation results
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
      label: Hardware
      description: Which CPUs/GPUs are you using?
      placeholder: >
        e.g. Ryzen 5950X + 2x RTX 4090
    validations:
      required: true
  - type: textarea
    id: model
    attributes:
      label: Models
      description: >
        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
.github/ISSUE_TEMPLATE/019-bug-misc.yml (vendored, new file, 81 lines)
@@ -0,0 +1,81 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        If applicable, please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
.github/ISSUE_TEMPLATE/02-bug-medium.yml (vendored, deleted, 50 lines)
@@ -1,50 +0,0 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
@@ -1,5 +1,5 @@
 name: Enhancement
-description: Used to request enhancements for llama.cpp
+description: Used to request enhancements for llama.cpp.
 title: "Feature Request: "
 labels: ["enhancement"]
 body:
.github/ISSUE_TEMPLATE/03-bug-high.yml (vendored, deleted, 50 lines)
@@ -1,50 +0,0 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
@@ -1,5 +1,5 @@
 name: Research
-description: Track new technical research area
+description: Track new technical research area.
 title: "Research: "
 labels: ["research 🔬"]
 body:
.github/ISSUE_TEMPLATE/04-bug-critical.yml (vendored, deleted, 50 lines)
@@ -1,50 +0,0 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
  - type: markdown
    attributes:
      value: |
        Thanks for taking the time to fill out this bug report!
        Please include information about your system, the steps to reproduce the bug,
        and the version of llama.cpp that you are using.
        If possible, please provide a minimal code example that reproduces the bug.
  - type: textarea
    id: what-happened
    attributes:
      label: What happened?
      description: Also tell us, what did you expect to happen?
      placeholder: Tell us what you see!
    validations:
      required: true
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which executable and which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: What operating system are you seeing the problem on?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
@@ -1,5 +1,5 @@
 name: Refactor (Maintainers)
-description: Used to track refactoring opportunities
+description: Used to track refactoring opportunities.
 title: "Refactor: "
 labels: ["refactor"]
 body:
.github/workflows/build.yml (vendored, 79 changes)

@@ -55,7 +55,13 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake .. \
+              -DLLAMA_FATAL_WARNINGS=ON \
+              -DLLAMA_CURL=ON \
+              -DGGML_METAL_USE_BF16=ON \
+              -DGGML_METAL_EMBED_LIBRARY=ON \
+              -DGGML_RPC=ON \
+              -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test

@@ -113,7 +119,12 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build \
+              -DLLAMA_FATAL_WARNINGS=ON \
+              -DLLAMA_CURL=ON \
+              -DGGML_METAL=OFF \
+              -DGGML_RPC=ON \
+              -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)

       - name: Test

@@ -394,15 +405,36 @@ jobs:
       - name: Build with native CMake HIP support
        id: cmake_build
        run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
+          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
           cmake --build build --config Release -j $(nproc)

       - name: Build with legacy HIP support
         id: cmake_build_legacy_hip
         run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
+          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)

+  ubuntu-22-cmake-musa:
+    runs-on: ubuntu-22.04
+    container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+      - name: Build with native CMake MUSA support
+        id: cmake_build
+        run: |
+          cmake -B build -S . -DGGML_MUSA=ON
+          cmake --build build --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04

@@ -569,6 +601,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \

@@ -599,6 +632,7 @@ jobs:
           mkdir build
           cd build
           cmake -G Xcode .. \
+            -DGGML_METAL_USE_BF16=ON \
             -DGGML_METAL_EMBED_LIBRARY=ON \
             -DLLAMA_BUILD_EXAMPLES=OFF \
             -DLLAMA_BUILD_TESTS=OFF \

@@ -734,7 +768,7 @@ jobs:
         id: clone_kompute
         if: ${{ matrix.build == 'kompute-x64' }}
         run: |
-          git submodule update --init ggml/src/kompute
+          git submodule update --init ggml/src/ggml-kompute/kompute

       - name: Download OpenBLAS
         id: get_openblas

@@ -917,8 +951,8 @@ jobs:
         shell: bash

     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
-      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
+      WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel
       ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone

@@ -928,7 +962,8 @@ jobs:
           fetch-depth: 0

       - name: Install
-        run: scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL
+        run: |
+          scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL

       - name: Build
         id: cmake_build

@@ -947,26 +982,34 @@ jobs:
            echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
          fi

-      - name: Pack artifacts
+      - name: Build the release package
         id: pack_artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
         run: |
           echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
-          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
-          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
           cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin
+
           echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

-      - name: Upload artifacts
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+      - name: Upload the release package
+        if: ${{ ( github.event_name == 'pull_request' && github.base_ref == 'master' ) }}
         uses: actions/upload-artifact@v4
         with:
           path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip

@@ -1001,7 +1044,7 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}

   windows-latest-cmake-hip-release:

@@ -1037,7 +1080,7 @@ jobs:
         run: |
           $env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
           $env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
-          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
+          cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIP=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS=${{ matrix.gpu_target }} -DGGML_RPC=ON
           cmake --build build -j ${env:NUMBER_OF_PROCESSORS}
           md "build\bin\rocblas\library\"
           cp "${env:HIP_PATH}\bin\hipblas.dll" "build\bin\"
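Several of the macOS build hunks above turn on GGML_METAL_USE_BF16. As background only, bfloat16 keeps the 8-bit exponent of IEEE fp32 and truncates the mantissa to 7 bits, so a value round-trips through the top 16 bits of the fp32 pattern. A minimal, hedged C++ sketch of that conversion (truncation only, not the ggml backend's actual implementation):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // fp32 -> bf16 by truncation: keep the sign, the 8-bit exponent, and the top 7 mantissa bits.
    static uint16_t fp32_to_bf16(float x) {
        uint32_t bits;
        std::memcpy(&bits, &x, sizeof(bits));
        return (uint16_t) (bits >> 16);
    }

    // bf16 -> fp32 is exact: place the 16 bits back into the high half of an fp32.
    static float bf16_to_fp32(uint16_t h) {
        uint32_t bits = (uint32_t) h << 16;
        float x;
        std::memcpy(&x, &bits, sizeof(x));
        return x;
    }

    int main() {
        float x = 3.14159265f;
        std::printf("%f -> bf16 -> %f\n", x, bf16_to_fp32(fp32_to_bf16(x)));
        return 0;
    }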
.github/workflows/docker.yml (vendored, 11 changes)

@@ -10,12 +10,10 @@
 name: Publish Docker image

 on:
-  #pull_request:
-  push:
-    branches:
-      - master
-    paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
-  workflow_dispatch: # allows manual triggering, useful for debugging
+  workflow_dispatch: # allows manual triggering
+  schedule:
+    # Rebuild daily rather than on every push because it is expensive
+    - cron: '12 4 * * *'

 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}

@@ -29,7 +27,6 @@ permissions:
 jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    #if: github.event.pull_request.draft == false

     runs-on: ubuntu-latest
     env:
.gitignore (vendored, 5 changes)

@@ -3,6 +3,7 @@
 *.a
 *.bat
 *.bin
+*.d
 *.dll
 *.dot
 *.etag

@@ -133,3 +134,7 @@ poetry.toml

 # Test models for lora adapters
 /lora-tests
+
+# Local scripts
+/run-vim.sh
+/run-chat.sh
.gitmodules (vendored, 2 changes)

@@ -1,3 +1,3 @@
 [submodule "kompute"]
-	path = ggml/src/kompute
+	path = ggml/src/ggml-kompute/kompute
 	url = https://github.com/nomic-ai/kompute.git
@@ -46,6 +46,13 @@ if (WIN32)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 endif()

+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/source-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:C>:/execution-charset:utf-8>")
+    add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:/execution-charset:utf-8>")
+endif()
+
 #
 # option list
 #

@@ -140,7 +147,6 @@ set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location o
 set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
 set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")

-
 # At the moment some compile definitions are placed within the ggml/src
 # directory but not exported on the `ggml` target. This could be improved by
 # determining _precisely_ which defines are necessary for the llama-config

@@ -157,8 +163,11 @@ if (GGML_TARGET_DEFINES)
     list(APPEND GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES})
 endif()
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)
-set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h)
+# all public headers
+set(LLAMA_PUBLIC_HEADERS
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama.h
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/llama-cpp.h)
+set_target_properties(llama PROPERTIES PUBLIC_HEADER "${LLAMA_PUBLIC_HEADERS}")
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 configure_package_config_file(
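The new MSVC block above forces UTF-8 for both the source and execution character sets; without it, MSVC infers the source encoding from the current code page and can mangle non-ASCII string literals. A hedged illustration of the effect (the string below is an arbitrary example, not taken from the repository):

    // With /source-charset:utf-8 and /execution-charset:utf-8, this literal keeps its
    // UTF-8 bytes at run time on MSVC, matching the behavior of gcc and clang.
    #include <cstdio>

    int main() {
        const char * greeting = "こんにちは, llama.cpp";  // non-ASCII UTF-8 text
        std::printf("%s\n", greeting);
        return 0;
    }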
@@ -24,11 +24,12 @@
 "CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
 }
 },
 { "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
 { "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
 { "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
 { "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
 { "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
+{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },

 {
 "name": "arm64-windows-msvc", "hidden": true,
@@ -57,25 +58,28 @@
 }
 },

-{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
+{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
 { "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
 { "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },

-{ "name": "arm64-apple-clang-debug" , "inherits": [ "base", "arm64-apple-clang", "debug" ] },
+{ "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
-{ "name": "arm64-apple-clang-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
+{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
-{ "name": "arm64-apple-clang+static-release" , "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
+{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },

-{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
+{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
 { "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
 { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },

-{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
 { "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
 { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },

-{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
+{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
 { "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
 { "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
-{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] }
+{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
+
+{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
+{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
 ]
 }
609
Makefile
@@ -34,6 +34,7 @@ BUILD_TARGETS = \
 llama-server \
 llama-simple \
 llama-simple-chat \
+llama-run \
 llama-speculative \
 llama-tokenize \
 llama-vdot \
@@ -48,7 +49,6 @@ TEST_TARGETS = \
 tests/test-backend-ops \
 tests/test-chat-template \
 tests/test-double-float \
-tests/test-grad0 \
 tests/test-grammar-integration \
 tests/test-grammar-parser \
 tests/test-json-schema-to-grammar \
@@ -252,7 +252,7 @@ endif
 #

 # keep standard at C11 and C++11
-MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
+MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU
 MK_CFLAGS = -std=c11 -fPIC
 MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
@@ -291,6 +291,7 @@ endif
 # some memory allocation are available on Linux through GNU extensions in libc
 ifeq ($(UNAME_S),Linux)
 MK_CPPFLAGS += -D_GNU_SOURCE
+MK_LDFLAGS += -ldl
 endif

 # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
@@ -359,6 +360,10 @@ ifdef LLAMA_SERVER_SSL
 MK_LDFLAGS += -lssl -lcrypto
 endif

+ifndef GGML_NO_CPU_AARCH64
+MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+endif
+
 # warnings
 WARN_FLAGS = \
 -Wall \
@@ -523,70 +528,59 @@ ifndef GGML_NO_ACCELERATE
 # Mac OS - include Accelerate framework.
 # `-framework Accelerate` works both with Apple Silicon and Mac Intel
 ifeq ($(UNAME_S),Darwin)
-MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
+MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
 MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
 MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
 MK_LDFLAGS += -framework Accelerate
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif
 endif # GGML_NO_ACCELERATE

-ifdef GGML_MUSA
-CC := clang
-CXX := clang++
-GGML_CUDA := 1
-MK_CPPFLAGS += -DGGML_USE_MUSA
-endif
-
 ifndef GGML_NO_OPENMP
 MK_CPPFLAGS += -DGGML_USE_OPENMP
 MK_CFLAGS += -fopenmp
 MK_CXXFLAGS += -fopenmp
-ifdef GGML_MUSA
-MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
-MK_LDFLAGS += -L/usr/lib/llvm-10/lib
-endif # GGML_MUSA
 endif # GGML_NO_OPENMP

 ifdef GGML_OPENBLAS
 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
 MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
 MK_LDFLAGS += $(shell pkg-config --libs openblas)
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS

 ifdef GGML_OPENBLAS64
 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
 MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
 MK_LDFLAGS += $(shell pkg-config --libs openblas64)
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_OPENBLAS64

 ifdef GGML_BLIS
 MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_BLIS -I/usr/local/include/blis -I/usr/include/blis
 MK_LDFLAGS += -lblis -L/usr/local/lib
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_BLIS

 ifdef GGML_NVPL
 MK_CPPFLAGS += -DGGML_USE_BLAS -DGGML_BLAS_USE_NVPL -DNVPL_ILP64 -I/usr/local/include/nvpl_blas -I/usr/include/nvpl_blas
 MK_LDFLAGS += -L/usr/local/lib -lnvpl_blas_core -lnvpl_blas_ilp64_gomp
-OBJ_GGML += ggml/src/ggml-blas.o
+OBJ_GGML_EXT += ggml/src/ggml-blas/ggml-blas.o
 endif # GGML_NVPL

 ifndef GGML_NO_LLAMAFILE
 MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
-OBJ_GGML += ggml/src/llamafile/sgemm.o
+OBJ_GGML_EXT += ggml/src/ggml-cpu/llamafile/sgemm.o
 endif

 ifndef GGML_NO_AMX
 MK_CPPFLAGS += -DGGML_USE_AMX
-OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
+OBJ_GGML_EXT += ggml/src/ggml-amx/ggml-amx.o ggml/src/ggml-amx/mmq.o
 endif

 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
-OBJ_GGML += ggml/src/ggml-rpc.o
+OBJ_GGML_EXT += ggml/src/ggml-rpc.o
 endif # GGML_RPC

 OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
@@ -601,41 +595,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS

 ifdef GGML_CUDA
-ifdef GGML_MUSA
-ifneq ('', '$(wildcard /opt/musa)')
-CUDA_PATH ?= /opt/musa
-else
-CUDA_PATH ?= /usr/local/musa
-endif
-
-MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
-MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
-MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
+ifneq ('', '$(wildcard /opt/cuda)')
+CUDA_PATH ?= /opt/cuda
 else
-ifneq ('', '$(wildcard /opt/cuda)')
-CUDA_PATH ?= /opt/cuda
-else
-CUDA_PATH ?= /usr/local/cuda
-endif
+CUDA_PATH ?= /usr/local/cuda
+endif

 MK_CPPFLAGS += -DGGML_USE_CUDA -DGGML_CUDA_USE_GRAPHS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 MK_NVCCFLAGS += -use_fast_math
-endif # GGML_MUSA

-OBJ_GGML += ggml/src/ggml-cuda.o
+OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
-OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
+OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
-OBJ_GGML += $(OBJ_CUDA_TMPL)
+OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)

 ifdef LLAMA_FATAL_WARNINGS
 MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS

-ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
-endif # GGML_MUSA

 ifdef LLAMA_DEBUG
 MK_NVCCFLAGS += -lineinfo
@@ -648,11 +628,7 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-ifdef GGML_MUSA
-NVCC = $(CCACHE) mcc
-else
-NVCC = $(CCACHE) nvcc
-endif # GGML_MUSA
+NVCC = $(CCACHE) nvcc
 endif # GGML_CUDA_NVCC

 ifdef CUDA_DOCKER_ARCH
@@ -661,10 +637,6 @@ else ifndef CUDA_POWER_ARCH
 MK_NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH

-ifdef GGML_CUDA_FORCE_DMMV
-MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
-endif # GGML_CUDA_FORCE_DMMV
-
 ifdef GGML_CUDA_FORCE_MMQ
 MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
 endif # GGML_CUDA_FORCE_MMQ
|
|||||||
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
|
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
|
||||||
endif # GGML_CUDA_FORCE_CUBLAS
|
endif # GGML_CUDA_FORCE_CUBLAS
|
||||||
|
|
||||||
ifdef GGML_CUDA_DMMV_X
|
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
|
|
||||||
else
|
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
|
|
||||||
endif # GGML_CUDA_DMMV_X
|
|
||||||
|
|
||||||
ifdef GGML_CUDA_MMV_Y
|
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
|
|
||||||
else ifdef GGML_CUDA_DMMV_Y
|
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
|
|
||||||
else
|
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
|
||||||
endif # GGML_CUDA_MMV_Y
|
|
||||||
|
|
||||||
ifdef GGML_CUDA_F16
|
ifdef GGML_CUDA_F16
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
||||||
endif # GGML_CUDA_F16
|
endif # GGML_CUDA_F16
|
||||||
@ -695,12 +653,6 @@ ifdef GGML_CUDA_DMMV_F16
|
|||||||
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
MK_NVCCFLAGS += -DGGML_CUDA_F16
|
||||||
endif # GGML_CUDA_DMMV_F16
|
endif # GGML_CUDA_DMMV_F16
|
||||||
|
|
||||||
ifdef GGML_CUDA_KQUANTS_ITER
|
|
||||||
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
|
|
||||||
else
|
|
||||||
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
|
||||||
endif
|
|
||||||
|
|
||||||
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||||
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
|
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
|
||||||
else
|
else
|
||||||
@ -724,15 +676,9 @@ define NVCC_COMPILE
|
|||||||
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
||||||
endef # NVCC_COMPILE
|
endef # NVCC_COMPILE
|
||||||
else
|
else
|
||||||
ifdef GGML_MUSA
|
|
||||||
define NVCC_COMPILE
|
|
||||||
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
|
|
||||||
endef # NVCC_COMPILE
|
|
||||||
else
|
|
||||||
define NVCC_COMPILE
|
define NVCC_COMPILE
|
||||||
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
|
||||||
endef # NVCC_COMPILE
|
endef # NVCC_COMPILE
|
||||||
endif # GGML_MUSA
|
|
||||||
endif # JETSON_EOL_MODULE_DETECT
|
endif # JETSON_EOL_MODULE_DETECT
|
||||||
|
|
||||||
ggml/src/ggml-cuda/%.o: \
|
ggml/src/ggml-cuda/%.o: \
|
||||||
@ -742,8 +688,8 @@ ggml/src/ggml-cuda/%.o: \
|
|||||||
ggml/src/ggml-cuda/common.cuh
|
ggml/src/ggml-cuda/common.cuh
|
||||||
$(NVCC_COMPILE)
|
$(NVCC_COMPILE)
|
||||||
|
|
||||||
ggml/src/ggml-cuda.o: \
|
ggml/src/ggml-cuda/ggml-cuda.o: \
|
||||||
ggml/src/ggml-cuda.cu \
|
ggml/src/ggml-cuda/ggml-cuda.cu \
|
||||||
ggml/include/ggml-cuda.h \
|
ggml/include/ggml-cuda.h \
|
||||||
ggml/include/ggml.h \
|
ggml/include/ggml.h \
|
||||||
ggml/include/ggml-backend.h \
|
ggml/include/ggml-backend.h \
|
||||||
@ -754,9 +700,9 @@ ggml/src/ggml-cuda.o: \
|
|||||||
endif # GGML_CUDA
|
endif # GGML_CUDA
|
||||||
|
|
||||||
ifdef GGML_VULKAN
|
ifdef GGML_VULKAN
|
||||||
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
MK_CPPFLAGS += -DGGML_USE_VULKAN
|
||||||
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
|
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
|
||||||
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
|
OBJ_GGML_EXT += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
|
||||||
|
|
||||||
ifdef GGML_VULKAN_CHECK_RESULTS
|
ifdef GGML_VULKAN_CHECK_RESULTS
|
||||||
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
|
||||||
@ -786,10 +732,10 @@ GLSLC_CMD = glslc
|
|||||||
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
|
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
|
||||||
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
|
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
|
||||||
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
|
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
|
||||||
_ggml_vk_input_dir = ggml/src/vulkan-shaders
|
_ggml_vk_input_dir = ggml/src/ggml-vulkan/vulkan-shaders
|
||||||
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
|
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
|
||||||
|
|
||||||
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
|
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
|
||||||
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
|
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
|
||||||
|
|
||||||
$(_ggml_vk_header): $(_ggml_vk_source)
|
$(_ggml_vk_header): $(_ggml_vk_source)
|
||||||
@ -801,8 +747,8 @@ $(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
|
|||||||
--target-hpp $(_ggml_vk_header) \
|
--target-hpp $(_ggml_vk_header) \
|
||||||
--target-cpp $(_ggml_vk_source)
|
--target-cpp $(_ggml_vk_source)
|
||||||
|
|
||||||
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
vulkan-shaders-gen: ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
|
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp
|
||||||
|
|
||||||
endif # GGML_VULKAN
|
endif # GGML_VULKAN
|
||||||
|
|
||||||
@ -815,11 +761,7 @@ ifdef GGML_HIPBLAS
|
|||||||
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
GGML_CUDA_DMMV_X ?= 32
|
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
|
||||||
GGML_CUDA_MMV_Y ?= 1
|
|
||||||
GGML_CUDA_KQUANTS_ITER ?= 2
|
|
||||||
|
|
||||||
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
|
|
||||||
|
|
||||||
ifdef GGML_HIP_UMA
|
ifdef GGML_HIP_UMA
|
||||||
MK_CPPFLAGS += -DGGML_HIP_UMA
|
MK_CPPFLAGS += -DGGML_HIP_UMA
|
||||||
@ -832,13 +774,6 @@ endif # GGML_HIP_UMA
|
|||||||
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
|
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
|
||||||
|
|
||||||
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
|
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
|
||||||
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
|
|
||||||
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
|
|
||||||
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
|
|
||||||
|
|
||||||
ifdef GGML_CUDA_FORCE_DMMV
|
|
||||||
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
|
||||||
endif # GGML_CUDA_FORCE_DMMV
|
|
||||||
|
|
||||||
ifdef GGML_CUDA_FORCE_MMQ
|
ifdef GGML_CUDA_FORCE_MMQ
|
||||||
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
|
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||||
@ -852,12 +787,12 @@ ifdef GGML_CUDA_NO_PEER_COPY
|
|||||||
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
||||||
endif # GGML_CUDA_NO_PEER_COPY
|
endif # GGML_CUDA_NO_PEER_COPY
|
||||||
|
|
||||||
OBJ_GGML += ggml/src/ggml-cuda.o
|
OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
|
||||||
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
||||||
OBJ_GGML += $(OBJ_CUDA_TMPL)
|
OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
|
||||||
|
|
||||||
ggml/src/ggml-cuda.o: \
|
ggml/src/ggml-cuda/ggml-cuda.o: \
|
||||||
ggml/src/ggml-cuda.cu \
|
ggml/src/ggml-cuda/ggml-cuda.cu \
|
||||||
ggml/include/ggml-cuda.h \
|
ggml/include/ggml-cuda.h \
|
||||||
ggml/include/ggml.h \
|
ggml/include/ggml.h \
|
||||||
ggml/include/ggml-backend.h \
|
ggml/include/ggml-backend.h \
|
||||||
@ -874,70 +809,168 @@ ggml/src/ggml-cuda/%.o: \
|
|||||||
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||||
endif # GGML_HIPBLAS
|
endif # GGML_HIPBLAS
|
||||||
|
|
||||||
|
ifdef GGML_MUSA
|
||||||
|
ifeq ($(wildcard /opt/musa),)
|
||||||
|
MUSA_PATH ?= /usr/local/musa
|
||||||
|
else
|
||||||
|
MUSA_PATH ?= /opt/musa
|
||||||
|
endif
|
||||||
|
MTGPU_TARGETS ?= mp_21 mp_22
|
||||||
|
|
||||||
|
MK_CPPFLAGS += -DGGML_USE_MUSA -DGGML_USE_CUDA
|
||||||
|
MK_LDFLAGS += -L$(MUSA_PATH)/lib -Wl,-rpath=$(MUSA_PATH)/lib
|
||||||
|
MK_LDFLAGS += -lmusa -lmusart -lmublas
|
||||||
|
|
||||||
|
ifndef GGML_NO_OPENMP
|
||||||
|
# For Ubuntu Focal
|
||||||
|
MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
|
||||||
|
MK_LDFLAGS += -L/usr/lib/llvm-10/lib
|
||||||
|
# For Ubuntu Jammy
|
||||||
|
MK_CPPFLAGS += -I/usr/lib/llvm-14/lib/clang/14.0.0/include
|
||||||
|
MK_LDFLAGS += -L/usr/lib/llvm-14/lib
|
||||||
|
endif # GGML_NO_OPENMP
|
||||||
|
|
||||||
|
CC := $(MUSA_PATH)/bin/clang
|
||||||
|
CXX := $(MUSA_PATH)/bin/clang++
|
||||||
|
MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
|
||||||
|
|
||||||
|
MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_FORCE_MMQ
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
|
||||||
|
endif # GGML_CUDA_FORCE_MMQ
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_FORCE_CUBLAS
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
|
||||||
|
endif # GGML_CUDA_FORCE_CUBLAS
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_F16
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_F16
|
||||||
|
endif # GGML_CUDA_F16
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_DMMV_F16
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_F16
|
||||||
|
endif # GGML_CUDA_DMMV_F16
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
|
||||||
|
else
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128
|
||||||
|
endif # GGML_CUDA_PEER_MAX_BATCH_SIZE
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_NO_PEER_COPY
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
|
||||||
|
endif # GGML_CUDA_NO_PEER_COPY
|
||||||
|
|
||||||
|
ifdef GGML_CUDA_FA_ALL_QUANTS
|
||||||
|
MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
|
||||||
|
endif # GGML_CUDA_FA_ALL_QUANTS
|
||||||
|
|
||||||
|
OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
|
||||||
|
OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
||||||
|
OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
|
||||||
|
|
||||||
|
ggml/src/ggml-cuda/ggml-cuda.o: \
|
||||||
|
ggml/src/ggml-cuda/ggml-cuda.cu \
|
||||||
|
ggml/include/ggml-cuda.h \
|
||||||
|
ggml/include/ggml.h \
|
||||||
|
ggml/include/ggml-backend.h \
|
||||||
|
ggml/src/ggml-backend-impl.h \
|
||||||
|
ggml/src/ggml-common.h \
|
||||||
|
$(wildcard ggml/src/ggml-cuda/*.cuh)
|
||||||
|
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
|
||||||
|
|
||||||
|
ggml/src/ggml-cuda/%.o: \
|
||||||
|
ggml/src/ggml-cuda/%.cu \
|
||||||
|
ggml/include/ggml.h \
|
||||||
|
ggml/src/ggml-common.h \
|
||||||
|
ggml/src/ggml-cuda/common.cuh
|
||||||
|
$(MCC) $(CXXFLAGS) $(MUSAFLAGS) -x musa -mtgpu -c -o $@ $<
|
||||||
|
endif # GGML_MUSA
|
||||||
|
|
||||||
ifdef GGML_METAL
|
ifdef GGML_METAL
|
||||||
MK_CPPFLAGS += -DGGML_USE_METAL
|
MK_CPPFLAGS += -DGGML_USE_METAL
|
||||||
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
||||||
OBJ_GGML += ggml/src/ggml-metal.o
|
OBJ_GGML_EXT += ggml/src/ggml-metal/ggml-metal.o
|
||||||
|
|
||||||
|
ifdef GGML_METAL_USE_BF16
|
||||||
|
MK_CPPFLAGS += -DGGML_METAL_USE_BF16
|
||||||
|
endif # GGML_METAL_USE_BF16
|
||||||
ifdef GGML_METAL_NDEBUG
|
ifdef GGML_METAL_NDEBUG
|
||||||
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
||||||
endif
|
endif
|
||||||
ifdef GGML_METAL_EMBED_LIBRARY
|
ifdef GGML_METAL_EMBED_LIBRARY
|
||||||
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
||||||
OBJ_GGML += ggml/src/ggml-metal-embed.o
|
OBJ_GGML_EXT += ggml/src/ggml-metal-embed.o
|
||||||
endif
|
endif
|
||||||
endif # GGML_METAL
|
endif # GGML_METAL
|
||||||
|
|
||||||
ifdef GGML_METAL
|
ifdef GGML_METAL
|
||||||
ggml/src/ggml-metal.o: \
|
ggml/src/ggml-metal/ggml-metal.o: \
|
||||||
ggml/src/ggml-metal.m \
|
ggml/src/ggml-metal/ggml-metal.m \
|
||||||
|
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||||
ggml/include/ggml-metal.h \
|
ggml/include/ggml-metal.h \
|
||||||
ggml/include/ggml.h
|
ggml/include/ggml.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
ifdef GGML_METAL_EMBED_LIBRARY
|
ifdef GGML_METAL_EMBED_LIBRARY
|
||||||
ggml/src/ggml-metal-embed.o: \
|
ggml/src/ggml-metal-embed.o: \
|
||||||
ggml/src/ggml-metal.metal \
|
ggml/src/ggml-metal/ggml-metal.metal \
|
||||||
|
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||||
ggml/src/ggml-common.h
|
ggml/src/ggml-common.h
|
||||||
@echo "Embedding Metal library"
|
@echo "Embedding Metal library"
|
||||||
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
|
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
|
||||||
|
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
|
||||||
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
||||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||||
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||||
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
||||||
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
||||||
@rmdir ${TEMP_ASSEMBLY}
|
@rmdir ${TEMP_ASSEMBLY}
|
||||||
endif
|
endif
|
||||||
endif # GGML_METAL
|
endif # GGML_METAL
|
||||||
|
|
||||||
OBJ_GGML += \
|
DIR_GGML = ggml
|
||||||
ggml/src/ggml.o \
|
DIR_LLAMA = src
|
||||||
ggml/src/ggml-cpu.o \
|
DIR_COMMON = common
|
||||||
ggml/src/ggml-alloc.o \
|
|
||||||
ggml/src/ggml-backend.o \
|
OBJ_GGML = \
|
||||||
ggml/src/ggml-quants.o \
|
$(DIR_GGML)/src/ggml.o \
|
||||||
ggml/src/ggml-aarch64.o
|
$(DIR_GGML)/src/ggml-aarch64.o \
|
||||||
|
$(DIR_GGML)/src/ggml-alloc.o \
|
||||||
|
$(DIR_GGML)/src/ggml-backend.o \
|
||||||
|
$(DIR_GGML)/src/ggml-backend-reg.o \
|
||||||
|
$(DIR_GGML)/src/ggml-opt.o \
|
||||||
|
$(DIR_GGML)/src/ggml-quants.o \
|
||||||
|
$(DIR_GGML)/src/ggml-threading.o \
|
||||||
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
|
||||||
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
|
||||||
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
|
||||||
|
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
|
||||||
|
$(OBJ_GGML_EXT)
|
||||||
|
|
||||||
OBJ_LLAMA = \
|
OBJ_LLAMA = \
|
||||||
src/llama.o \
|
$(DIR_LLAMA)/llama.o \
|
||||||
src/llama-vocab.o \
|
$(DIR_LLAMA)/llama-vocab.o \
|
||||||
src/llama-grammar.o \
|
$(DIR_LLAMA)/llama-grammar.o \
|
||||||
src/llama-sampling.o \
|
$(DIR_LLAMA)/llama-sampling.o \
|
||||||
src/unicode.o \
|
$(DIR_LLAMA)/unicode.o \
|
||||||
src/unicode-data.o
|
$(DIR_LLAMA)/unicode-data.o
|
||||||
|
|
||||||
OBJ_COMMON = \
|
OBJ_COMMON = \
|
||||||
common/common.o \
|
$(DIR_COMMON)/common.o \
|
||||||
common/arg.o \
|
$(DIR_COMMON)/arg.o \
|
||||||
common/log.o \
|
$(DIR_COMMON)/log.o \
|
||||||
common/console.o \
|
$(DIR_COMMON)/console.o \
|
||||||
common/ngram-cache.o \
|
$(DIR_COMMON)/ngram-cache.o \
|
||||||
common/sampling.o \
|
$(DIR_COMMON)/sampling.o \
|
||||||
common/build-info.o \
|
$(DIR_COMMON)/speculative.o \
|
||||||
common/json-schema-to-grammar.o
|
$(DIR_COMMON)/build-info.o \
|
||||||
|
$(DIR_COMMON)/json-schema-to-grammar.o
|
||||||
|
|
||||||
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
|
OBJ_ALL = $(OBJ_GGML) $(OBJ_LLAMA) $(OBJ_COMMON)
|
||||||
|
|
||||||
@ -993,7 +1026,6 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
|
|||||||
ifdef GGML_CUDA
|
ifdef GGML_CUDA
|
||||||
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||||
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
|
||||||
ifndef GGML_MUSA
|
|
||||||
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
|
||||||
|
|
||||||
ifndef CUDA_DOCKER_ARCH
|
ifndef CUDA_DOCKER_ARCH
|
||||||
@ -1003,7 +1035,6 @@ endif # CUDA_POWER_ARCH
|
|||||||
endif # CUDA_DOCKER_ARCH
|
endif # CUDA_DOCKER_ARCH
|
||||||
|
|
||||||
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
|
||||||
endif # GGML_MUSA
|
|
||||||
endif # GGML_CUDA
|
endif # GGML_CUDA
|
||||||
$(info )
|
$(info )
|
||||||
|
|
||||||
@@ -1040,224 +1071,78 @@ endif
 # Build libraries
 #

-# ggml
-
-ggml/src/ggml.o: \
-ggml/src/ggml.c \
-ggml/include/ggml.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-cpu.o: \
-ggml/src/ggml-cpu.c \
-ggml/include/ggml.h \
-ggml/src/ggml-common.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-alloc.o: \
-ggml/src/ggml-alloc.c \
-ggml/include/ggml.h \
-ggml/include/ggml-alloc.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-backend.o: \
-ggml/src/ggml-backend.cpp \
-ggml/src/ggml-backend-impl.h \
-ggml/include/ggml.h \
-ggml/include/ggml-backend.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ggml/src/ggml-quants.o: \
-ggml/src/ggml-quants.c \
-ggml/include/ggml.h \
-ggml/src/ggml-quants.h \
-ggml/src/ggml-common.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-aarch64.o: \
-ggml/src/ggml-aarch64.c \
-ggml/include/ggml.h \
-ggml/src/ggml-aarch64.h \
-ggml/src/ggml-common.h
-$(CC) $(CFLAGS) -c $< -o $@
-
-ggml/src/ggml-blas.o: \
-ggml/src/ggml-blas.cpp \
-ggml/include/ggml-blas.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ifndef GGML_NO_LLAMAFILE
-ggml/src/llamafile/sgemm.o: \
-ggml/src/llamafile/sgemm.cpp \
-ggml/src/llamafile/sgemm.h \
-ggml/include/ggml.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_NO_LLAMAFILE
-
-ifndef GGML_NO_AMX
-ggml/src/ggml-amx.o: \
-ggml/src/ggml-amx.cpp \
-ggml/include/ggml-amx.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-ggml/src/ggml-amx/mmq.o: \
-ggml/src/ggml-amx/mmq.cpp \
-ggml/src/ggml-amx/mmq.h \
-ggml/include/ggml.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-endif
-
-ifdef GGML_RPC
-ggml/src/ggml-rpc.o: \
-ggml/src/ggml-rpc.cpp \
-ggml/include/ggml-rpc.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-endif # GGML_RPC
-
-$(LIB_GGML): \
-$(OBJ_GGML)
-$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_GGML_S): \
-$(OBJ_GGML)
-ar rcs $(LIB_GGML_S) $^
-
-# llama
-
-src/unicode.o: \
-src/unicode.cpp \
-src/unicode.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/unicode-data.o: \
-src/unicode-data.cpp \
-src/unicode-data.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama.o: \
-src/llama.cpp \
-src/llama-impl.h \
-src/llama-vocab.h \
-src/llama-grammar.h \
-src/llama-sampling.h \
-src/unicode.h \
-include/llama.h \
-ggml/include/ggml-cuda.h \
-ggml/include/ggml-metal.h \
+# Libraries
+LIB_GGML = libggml.so
+LIB_GGML_S = libggml.a
+
+LIB_LLAMA = libllama.so
+LIB_LLAMA_S = libllama.a
+
+LIB_COMMON = libcommon.so
+LIB_COMMON_S = libcommon.a
+
+# Targets
+BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
+
+# Dependency files
+DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
+
+# Default target
+all: $(BUILD_TARGETS)
+
+# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
+# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
+$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
+ggml/src/ggml-cpu/ggml-cpu.cpp \
+ggml/include/ggml-backend.h \
 ggml/include/ggml.h \
 ggml/include/ggml-alloc.h \
-ggml/include/ggml-backend.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
+ggml/src/ggml-backend-impl.h \
+ggml/include/ggml-cpu.h \
+ggml/src/ggml-impl.h
+$(CXX) $(CXXFLAGS) -c $< -o $@

-src/llama-vocab.o: \
-src/llama-vocab.cpp \
-src/llama-vocab.h \
-src/llama-impl.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama-grammar.o: \
-src/llama-grammar.cpp \
-src/llama-grammar.h \
-src/llama-impl.h \
-src/llama-vocab.h \
-src/llama-sampling.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-src/llama-sampling.o: \
-src/llama-sampling.cpp \
-src/llama-sampling.h \
-src/llama-impl.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(LIB_LLAMA): \
-$(OBJ_LLAMA) \
-$(LIB_GGML)
+# Rules for building object files
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
+$(CC) $(CFLAGS) -MMD -c $< -o $@
+
+$(DIR_GGML)/%.o: $(DIR_GGML)/%.cpp
+$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+$(DIR_LLAMA)/%.o: $(DIR_LLAMA)/%.cpp
+$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+$(DIR_COMMON)/%.o: $(DIR_COMMON)/%.cpp
+$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
+
+# Rules for building libraries
+$(LIB_GGML): $(OBJ_GGML)
 $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

-$(LIB_LLAMA_S): \
-$(OBJ_LLAMA)
+$(LIB_GGML_S): $(OBJ_GGML)
+ar rcs $(LIB_GGML_S) $^
+
+$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
+$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_LLAMA_S): $(OBJ_LLAMA)
 ar rcs $(LIB_LLAMA_S) $^

-# common
-
-common/common.o: \
-common/common.cpp \
-common/common.h \
-common/console.h \
-common/sampling.h \
-common/json.hpp \
-common/json-schema-to-grammar.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/arg.o: \
-common/arg.cpp \
-common/arg.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/log.o: \
-common/log.cpp \
-common/log.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/sampling.o: \
-common/sampling.cpp \
-common/sampling.h \
-include/llama.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/console.o: \
-common/console.cpp \
-common/console.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/json-schema-to-grammar.o: \
-common/json-schema-to-grammar.cpp \
-common/json-schema-to-grammar.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-common/ngram-cache.o: \
-common/ngram-cache.cpp \
-common/ngram-cache.h
-$(CXX) $(CXXFLAGS) -c $< -o $@
-
-$(LIB_COMMON): \
-$(OBJ_COMMON) \
-$(LIB_LLAMA) \
-$(LIB_GGML)
+$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
 $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

-$(LIB_COMMON_S): \
-$(OBJ_COMMON)
+$(LIB_COMMON_S): $(OBJ_COMMON)
 ar rcs $(LIB_COMMON_S) $^

+# Include dependency files
+-include $(DEP_FILES)
+
+# Clean rule
 clean:
-rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
-rm -rvf src/*.o
-rm -rvf tests/*.o
-rm -rvf examples/*.o
-rm -rvf common/*.o
-rm -rvf *.a
-rm -rvf *.dll
-rm -rvf *.so
-rm -rvf *.dot
-rm -rvf ggml/*.a
-rm -rvf ggml/*.dll
-rm -rvf ggml/*.so
-rm -vrf ggml/src/*.o
-rm -rvf ggml/src/llamafile/*.o
-rm -rvf common/build-info.cpp
-rm -vrf ggml/src/ggml-metal-embed.metal
-rm -vrf ggml/src/ggml-cuda/*.o
-rm -vrf ggml/src/ggml-cuda/template-instances/*.o
-rm -vrf ggml/src/ggml-amx/*.o
-rm -rvf $(BUILD_TARGETS)
-rm -rvf $(TEST_TARGETS)
-rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
-rm -rvf $(LEGACY_TARGETS_CLEAN)
-find examples pocs -type f -name "*.o" -delete
+rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
+rm -rvf *.a *.dll *.so *.dot
+find ggml src common tests examples pocs -type f -name "*.o" -delete
+find ggml src common tests examples pocs -type f -name "*.d" -delete

 #
 # Examples
@@ -1283,6 +1168,11 @@ llama-infill: examples/infill/infill.cpp \
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+llama-run: examples/run/run.cpp \
+$(OBJ_ALL)
+$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 llama-simple: examples/simple/simple.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -1563,11 +1453,6 @@ tests/test-json-schema-to-grammar: tests/test-json-schema-to-grammar.cpp \
 $(CXX) $(CXXFLAGS) -Iexamples/server -c $< -o $(call GET_OBJ_FILE, $<)
 $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-tests/test-grad0: tests/test-grad0.cpp \
-$(OBJ_GGML)
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 tests/test-opt: tests/test-opt.cpp \
 $(OBJ_GGML)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
@@ -10,11 +10,16 @@ var sources = [
 "src/unicode.cpp",
 "src/unicode-data.cpp",
 "ggml/src/ggml.c",
-"ggml/src/ggml-cpu.c",
+"ggml/src/ggml-aarch64.c",
 "ggml/src/ggml-alloc.c",
 "ggml/src/ggml-backend.cpp",
+"ggml/src/ggml-backend-reg.cpp",
+"ggml/src/ggml-cpu/ggml-cpu.c",
+"ggml/src/ggml-cpu/ggml-cpu.cpp",
+"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
+"ggml/src/ggml-cpu/ggml-cpu-quants.c",
+"ggml/src/ggml-threading.cpp",
 "ggml/src/ggml-quants.c",
-"ggml/src/ggml-aarch64.c",
 ]

 var resources: [Resource] = []
@@ -22,6 +27,7 @@ var linkerSettings: [LinkerSetting] = []
 var cSettings: [CSetting] = [
 .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
 .unsafeFlags(["-fno-objc-arc"]),
+.headerSearchPath("ggml/src"),
 // NOTE: NEW_LAPACK will required iOS version 16.4+
 // We should consider add this in the future when we drop support for iOS 14
 // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
@@ -30,13 +36,15 @@ var cSettings: [CSetting] = [
 ]

 #if canImport(Darwin)
-sources.append("ggml/src/ggml-metal.m")
-resources.append(.process("ggml/src/ggml-metal.metal"))
+sources.append("ggml/src/ggml-common.h")
+sources.append("ggml/src/ggml-metal/ggml-metal.m")
+resources.append(.process("ggml/src/ggml-metal/ggml-metal.metal"))
 linkerSettings.append(.linkedFramework("Accelerate"))
 cSettings.append(
 contentsOf: [
 .define("GGML_USE_ACCELERATE"),
-.define("GGML_USE_METAL")
+.define("GGML_USE_METAL"),
+.define("GGML_USE_CPU")
 ]
 )
 #endif
@@ -61,13 +69,15 @@ let package = Package(
 name: "llama",
 path: ".",
 exclude: [
+"build",
 "cmake",
 "examples",
 "scripts",
 "models",
 "tests",
 "CMakeLists.txt",
-"Makefile"
+"Makefile",
+"ggml/src/ggml-metal-embed.metal"
 ],
 sources: sources,
 resources: resources,
@@ -131,6 +131,7 @@ Typically finetunes of the base models below are supported as well.
 - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
+- Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
 - PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
@@ -458,14 +459,14 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
 - Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)

-## Other documentations
+## Other documentation

 - [main (cli)](./examples/main/README.md)
 - [server](./examples/server/README.md)
 - [jeopardy](./examples/jeopardy/README.md)
 - [GBNF grammars](./grammars/README.md)

-**Development documentations**
+**Development documentation**

 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
@@ -39,7 +39,7 @@ SRC=`pwd`
 CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"

 if [ ! -z ${GG_BUILD_METAL} ]; then
-CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
+CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
@@ -3,18 +3,60 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
 set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
 set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)

-set(GGML_BLAS @GGML_BLAS@)
-set(GGML_CUDA @GGML_CUDA@)
-set(GGML_METAL @GGML_METAL@)
-set(GGML_HIPBLAS @GGML_HIPBLAS@)
+set(GGML_STATIC @GGML_STATIC@)
+set(GGML_NATIVE @GGML_NATIVE@)
+set(GGML_LTO @GGML_LTO@)
+set(GGML_CCACHE @GGML_CCACHE@)
+set(GGML_AVX @GGML_AVX@)
+set(GGML_AVX2 @GGML_AVX2@)
+set(GGML_AVX512 @GGML_AVX512@)
+set(GGML_AVX512_VBMI @GGML_AVX512_VBMI@)
+set(GGML_AVX512_VNNI @GGML_AVX512_VNNI@)
+set(GGML_AVX512_BF16 @GGML_AVX512_BF16@)
+set(GGML_AMX_TILE @GGML_AMX_TILE@)
+set(GGML_AMX_INT8 @GGML_AMX_INT8@)
+set(GGML_AMX_BF16 @GGML_AMX_BF16@)
+set(GGML_FMA @GGML_FMA@)
+set(GGML_LASX @GGML_LASX@)
+set(GGML_LSX @GGML_LSX@)
+set(GGML_RVV @GGML_RVV@)
+set(GGML_SVE @GGML_SVE@)
+
 set(GGML_ACCELERATE @GGML_ACCELERATE@)
-set(GGML_VULKAN @GGML_VULKAN@)
+set(GGML_OPENMP @GGML_OPENMP@)
+set(GGML_CPU_HBM @GGML_CPU_HBM@)
+set(GGML_BLAS_VENDOR @GGML_BLAS_VENDOR@)
+
+set(GGML_CUDA_FORCE_MMQ @GGML_CUDA_FORCE_MMQ@)
+set(GGML_CUDA_FORCE_CUBLAS @GGML_CUDA_FORCE_CUBLAS@)
+set(GGML_CUDA_F16 @GGML_CUDA_F16@)
+set(GGML_CUDA_PEER_MAX_BATCH_SIZE @GGML_CUDA_PEER_MAX_BATCH_SIZE@)
+set(GGML_CUDA_NO_PEER_COPY @GGML_CUDA_NO_PEER_COPY@)
+set(GGML_CUDA_NO_VMM @GGML_CUDA_NO_VMM@)
+set(GGML_CUDA_FA_ALL_QUANTS @GGML_CUDA_FA_ALL_QUANTS@)
+set(GGML_CUDA_GRAPHS @GGML_CUDA_GRAPHS@)
+
+set(GGML_HIP_UMA @GGML_HIP_UMA@)
+
 set(GGML_VULKAN_CHECK_RESULTS @GGML_VULKAN_CHECK_RESULTS@)
 set(GGML_VULKAN_DEBUG @GGML_VULKAN_DEBUG@)
 set(GGML_VULKAN_MEMORY_DEBUG @GGML_VULKAN_MEMORY_DEBUG@)
-set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
-set(GGML_SYCL @GGML_SYCL@)
-set(GGML_OPENMP @GGML_OPENMP@)
+set(GGML_VULKAN_SHADER_DEBUG_INFO @GGML_VULKAN_SHADER_DEBUG_INFO@)
+set(GGML_VULKAN_PERF @GGML_VULKAN_PERF@)
+set(GGML_VULKAN_VALIDATE @GGML_VULKAN_VALIDATE@)
+set(GGML_VULKAN_RUN_TESTS @GGML_VULKAN_RUN_TESTS@)
+
+set(GGML_METAL_USE_BF16 @GGML_METAL_USE_BF16@)
+set(GGML_METAL_NDEBUG @GGML_METAL_NDEBUG@)
+set(GGML_METAL_SHADER_DEBUG @GGML_METAL_SHADER_DEBUG@)
+set(GGML_METAL_EMBED_LIBRARY @GGML_METAL_EMBED_LIBRARY@)
+set(GGML_METAL_MACOSX_VERSION_MIN @GGML_METAL_MACOSX_VERSION_MIN@)
+set(GGML_METAL_STD @GGML_METAL_STD@)
+
+set(GGML_SYCL_F16 @GGML_SYCL_F16@)
+set(GGML_SYCL_TARGET @GGML_SYCL_TARGET@)
+set(GGML_SYCL_DEVICE_ARCH @GGML_SYCL_DEVICE_ARCH@)


 @PACKAGE_INIT@

@@ -22,65 +64,111 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

-# Ensure transient dependencies satisfied
-
 find_package(Threads REQUIRED)

-if (APPLE AND GGML_ACCELERATE)
-find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
+set(_llama_link_deps "")
+set(_llama_link_opts "")
+foreach(_ggml_lib ggml ggml-base)
+string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
+find_library(${_ggml_lib_var} ${_ggml_lib}
+REQUIRED
+HINTS ${LLAMA_LIB_DIR}
+NO_CMAKE_FIND_ROOT_PATH
+)
+list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
+message(STATUS "Found ${${_ggml_lib_var}}")
+endforeach()
+
+foreach(backend amx blas cann cpu cuda hip kompute metal musa rpc sycl vulkan)
+string(TOUPPER "GGML_${backend}" backend_id)
+set(_ggml_lib "ggml-${backend}")
+string(REPLACE "-" "_" _ggml_lib_var "${_ggml_lib}_LIBRARY")
+
+find_library(${_ggml_lib_var} ${_ggml_lib}
+HINTS ${LLAMA_LIB_DIR}
+NO_CMAKE_FIND_ROOT_PATH
+)
+if(${_ggml_lib_var})
+list(APPEND _llama_link_deps "${${_ggml_lib_var}}")
+set(${backend_id} ON)
+message(STATUS "Found backend ${${_ggml_lib_var}}")
+else()
+set(${backend_id} OFF)
+endif()
+endforeach()
+
+if (NOT LLAMA_SHARED_LIB)
+if (APPLE AND GGML_ACCELERATE)
+find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
+list(APPEND _llama_link_deps ${ACCELERATE_FRAMEWORK})
+endif()
+
+if (GGML_OPENMP)
+find_package(OpenMP REQUIRED)
+list(APPEND _llama_link_deps OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+endif()
+
+if (GGML_CPU_HBM)
+find_library(memkind memkind REQUIRED)
+list(APPEND _llama_link_deps memkind)
+endif()
+
+if (GGML_BLAS)
+find_package(BLAS REQUIRED)
+list(APPEND _llama_link_deps ${BLAS_LIBRARIES})
+list(APPEND _llama_link_opts ${BLAS_LINKER_FLAGS})
+endif()
+
+if (GGML_CUDA)
+find_package(CUDAToolkit REQUIRED)
+endif()
+
+if (GGML_METAL)
+find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+find_library(METAL_FRAMEWORK Metal REQUIRED)
+find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+list(APPEND _llama_link_deps ${FOUNDATION_LIBRARY}
+${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
+endif()
+
+if (GGML_VULKAN)
+find_package(Vulkan REQUIRED)
+list(APPEND _llama_link_deps Vulkan::Vulkan)
+endif()
+
+if (GGML_HIP)
+find_package(hip REQUIRED)
+find_package(hipblas REQUIRED)
+find_package(rocblas REQUIRED)
+list(APPEND _llama_link_deps hip::host roc::rocblas roc::hipblas)
+endif()
+
+if (GGML_SYCL)
+find_package(DNNL)
+if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+list(APPEND _llama_link_deps DNNL::dnnl)
+endif()
+if (WIN32)
+find_package(IntelSYCL REQUIRED)
+find_package(MKL REQUIRED)
+list(APPEND _llama_link_deps IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+endif()
+endif()
 endif()

-if (GGML_BLAS)
-find_package(BLAS REQUIRED)
-endif()
-
-if (GGML_CUDA)
-find_package(CUDAToolkit REQUIRED)
-endif()
-
-if (GGML_METAL)
-find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-find_library(METAL_FRAMEWORK Metal REQUIRED)
-find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-endif()
-
-if (GGML_VULKAN)
-find_package(Vulkan REQUIRED)
-endif()
-
-if (GGML_HIPBLAS)
-find_package(hip REQUIRED)
-find_package(hipblas REQUIRED)
-find_package(rocblas REQUIRED)
-endif()
-
-if (GGML_SYCL)
-find_package(IntelSYCL REQUIRED)
-find_package(MKL REQUIRED)
-endif()
-
-if (GGML_OPENMP)
-find_package(OpenMP REQUIRED)
-endif()
-
-
-find_library(ggml_LIBRARY ggml
-REQUIRED
-HINTS ${LLAMA_LIB_DIR})
-
 find_library(llama_LIBRARY llama
 REQUIRED
-HINTS ${LLAMA_LIB_DIR})
-
-set(_llama_link_deps "${ggml_LIBRARY}" "@GGML_LINK_LIBRARIES@")
-set(_llama_transient_defines "@GGML_TRANSIENT_DEFINES@")
+HINTS ${LLAMA_LIB_DIR}
+NO_CMAKE_FIND_ROOT_PATH
+)

 add_library(llama UNKNOWN IMPORTED)

 set_target_properties(llama
 PROPERTIES
 INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
 INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
+INTERFACE_LINK_OPTIONS "${_llama_link_opts}"
 INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
 IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
 IMPORTED_LOCATION "${llama_LIBRARY}"
@@ -66,6 +66,8 @@ add_library(${TARGET} STATIC
     ngram-cache.h
     sampling.cpp
     sampling.h
+    speculative.cpp
+    speculative.h
     )

 if (BUILD_SHARED_LIBS)
509
common/arg.cpp
@@ -233,10 +233,11 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     }

     postprocess_cpu_params(params.cpuparams, nullptr);
     postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
-    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+    postprocess_cpu_params(params.speculative.cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.speculative.cpuparams_batch, &params.cpuparams_batch);

     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
@@ -251,7 +252,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         for (auto & antiprompt : params.antiprompt) {
             string_process_escapes(antiprompt);
         }
-        for (auto & seq_breaker : params.sparams.dry_sequence_breakers) {
+        for (auto & seq_breaker : params.sampling.dry_sequence_breakers) {
             string_process_escapes(seq_breaker);
         }
     }
@@ -297,6 +298,27 @@ static void common_params_print_usage(common_params_context & ctx_arg) {
     print_options(specific_options);
 }

+static std::vector<ggml_backend_dev_t> parse_device_list(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    auto dev_names = string_split<std::string>(value, ',');
+    if (dev_names.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (dev_names.size() == 1 && dev_names[0] == "none") {
+        devices.push_back(nullptr);
+    } else {
+        for (const auto & device : dev_names) {
+            auto * dev = ggml_backend_dev_by_name(device.c_str());
+            if (!dev || ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_GPU) {
+                throw std::invalid_argument(string_format("invalid device: %s", device.c_str()));
+            }
+            devices.push_back(dev);
+        }
+        devices.push_back(nullptr);
+    }
+    return devices;
+}
+
 bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     auto ctx_arg = common_params_parser_init(params, ex, print_usage);
     const common_params params_org = ctx_arg.params; // the example can modify the default params
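Note: the helper added above turns a comma-separated device string into a GPU device list that is always terminated by a nullptr sentinel ("none" yields only the sentinel). A minimal sketch of how it might be exercised; the device name "CUDA0" and the surrounding main() are illustrative assumptions, not part of this patch, and parse_device_list() is assumed visible in the same translation unit:

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
        ggml_backend_load_all(); // load dynamic backends, as common_params_parser_init() now does

        // "CUDA0" is a hypothetical device name; real names come from --list-devices
        std::vector<ggml_backend_dev_t> devices = parse_device_list("CUDA0");

        // the vector is nullptr-terminated so it can be handed to llama_model_params::devices
        for (ggml_backend_dev_t dev : devices) {
            printf("%s\n", dev ? ggml_backend_dev_name(dev) : "(end of list)");
        }
        return 0;
    }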
@@ -323,13 +345,16 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     }

 common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
+    // load dynamic backends
+    ggml_backend_load_all();
+
     common_params_context ctx_arg(params);
     ctx_arg.print_usage = print_usage;
     ctx_arg.ex          = ex;

     std::string sampler_type_chars;
     std::string sampler_type_names;
-    for (const auto & sampler : params.sparams.samplers) {
+    for (const auto & sampler : params.sampling.samplers) {
         sampler_type_chars += common_sampler_type_to_chr(sampler);
         sampler_type_names += common_sampler_type_to_str(sampler) + ";";
     }
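Note: for context, a caller goes through common_params_parse(), which now loads the dynamic backends before building the option table. A hedged sketch of an example-style entry point; the usage callback and the printed fields are placeholders, not code from this commit:

    #include "arg.h"
    #include "common.h"
    #include <cstdio>

    static void print_usage(int /*argc*/, char ** argv) {
        fprintf(stderr, "usage: %s [options]\n", argv[0]); // placeholder usage text
    }

    int main(int argc, char ** argv) {
        common_params params;
        // LLAMA_EXAMPLE_COMMON is the generic example id referenced elsewhere in arg.cpp
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
            return 1;
        }
        // after parsing, the renamed sampling/speculative sub-structs are filled in
        fprintf(stderr, "temp = %.2f, draft max = %d\n",
                (double) params.sampling.temp, params.speculative.n_max);
        return 0;
    }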
@@ -407,26 +432,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     }
     ));
-    add_opt(common_arg(
-        {"-td", "--threads-draft"}, "N",
-        "number of threads to use during generation (default: same as --threads)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.n_threads = value;
-            if (params.draft_cpuparams.n_threads <= 0) {
-                params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-tbd", "--threads-batch-draft"}, "N",
-        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.n_threads = value;
-            if (params.draft_cpuparams_batch.n_threads <= 0) {
-                params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-C", "--cpu-mask"}, "M",
         "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")",
@@ -515,108 +520,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             params.cpuparams_batch.poll = value;
         }
     ));
-    add_opt(common_arg(
-        {"-Cd", "--cpu-mask-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crd", "--cpu-range-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams.cpumask)) {
-                throw std::invalid_argument("invalid range");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: same as --poll])",
-        [](common_params & params, int value) {
-            params.draft_cpuparams.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
-        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
-        [](common_params & params, const std::string & mask) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_mask(mask, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
-        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
-        [](common_params & params, const std::string & range) {
-            params.draft_cpuparams_batch.mask_valid = true;
-            if (!parse_cpu_range(range, params.draft_cpuparams_batch.cpumask)) {
-                throw std::invalid_argument("invalid cpumask");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--cpu-strict-batch-draft"}, "<0|1>",
-        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.strict_cpu = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--prio-batch-draft"}, "N",
-        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.draft_cpuparams_batch.priority),
-        [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
-                throw std::invalid_argument("invalid value");
-            }
-            params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) prio;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--poll-batch-draft"}, "<0|1>",
-        "Use polling to wait for draft model work (default: --poll-draft)",
-        [](common_params & params, int value) {
-            params.draft_cpuparams_batch.poll = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
-    add_opt(common_arg(
-        {"--draft"}, "N",
-        string_format("number of tokens to draft for speculative decoding (default: %d)", params.n_draft),
-        [](common_params & params, int value) {
-            params.n_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
-    add_opt(common_arg(
-        {"-ps", "--p-split"}, "N",
-        string_format("speculative decoding split probability (default: %.1f)", (double)params.p_split),
-        [](common_params & params, const std::string & value) {
-            params.p_split = std::stof(value);
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-lcs", "--lookup-cache-static"}, "FNAME",
         "path to static lookup cache to use for lookup decoding (not updated by generation)",
@@ -701,7 +604,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
         [](common_params & params) {
             params.no_perf = true;
-            params.sparams.no_perf = true;
+            params.sampling.no_perf = true;
         }
     ).set_env("LLAMA_ARG_NO_PERF"));
     add_opt(common_arg(
@@ -883,155 +786,155 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         string_format("samplers that will be used for generation in the order, separated by \';\'\n(default: %s)", sampler_type_names.c_str()),
         [](common_params & params, const std::string & value) {
             const auto sampler_names = string_split<std::string>(value, ';');
-            params.sparams.samplers = common_sampler_types_from_names(sampler_names, true);
+            params.sampling.samplers = common_sampler_types_from_names(sampler_names, true);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"-s", "--seed"}, "SEED",
-        string_format("RNG seed (default: %d, use random seed for %d)", params.sparams.seed, LLAMA_DEFAULT_SEED),
+        string_format("RNG seed (default: %d, use random seed for %d)", params.sampling.seed, LLAMA_DEFAULT_SEED),
         [](common_params & params, const std::string & value) {
-            params.sparams.seed = std::stoul(value);
+            params.sampling.seed = std::stoul(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--sampling-seq"}, "SEQUENCE",
         string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()),
         [](common_params & params, const std::string & value) {
-            params.sparams.samplers = common_sampler_types_from_chars(value);
+            params.sampling.samplers = common_sampler_types_from_chars(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--ignore-eos"},
         "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)",
         [](common_params & params) {
-            params.sparams.ignore_eos = true;
+            params.sampling.ignore_eos = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--penalize-nl"},
-        string_format("penalize newline tokens (default: %s)", params.sparams.penalize_nl ? "true" : "false"),
+        string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"),
         [](common_params & params) {
-            params.sparams.penalize_nl = true;
+            params.sampling.penalize_nl = true;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--temp"}, "N",
-        string_format("temperature (default: %.1f)", (double)params.sparams.temp),
+        string_format("temperature (default: %.1f)", (double)params.sampling.temp),
         [](common_params & params, const std::string & value) {
-            params.sparams.temp = std::stof(value);
-            params.sparams.temp = std::max(params.sparams.temp, 0.0f);
+            params.sampling.temp = std::stof(value);
+            params.sampling.temp = std::max(params.sampling.temp, 0.0f);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-k"}, "N",
-        string_format("top-k sampling (default: %d, 0 = disabled)", params.sparams.top_k),
+        string_format("top-k sampling (default: %d, 0 = disabled)", params.sampling.top_k),
         [](common_params & params, int value) {
-            params.sparams.top_k = value;
+            params.sampling.top_k = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--top-p"}, "N",
-        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sparams.top_p),
+        string_format("top-p sampling (default: %.1f, 1.0 = disabled)", (double)params.sampling.top_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.top_p = std::stof(value);
+            params.sampling.top_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--min-p"}, "N",
-        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sparams.min_p),
+        string_format("min-p sampling (default: %.1f, 0.0 = disabled)", (double)params.sampling.min_p),
        [](common_params & params, const std::string & value) {
-            params.sparams.min_p = std::stof(value);
+            params.sampling.min_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-probability"}, "N",
-        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sparams.xtc_probability),
+        string_format("xtc probability (default: %.1f, 0.0 = disabled)", (double)params.sampling.xtc_probability),
         [](common_params & params, const std::string & value) {
-            params.sparams.xtc_probability = std::stof(value);
+            params.sampling.xtc_probability = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--xtc-threshold"}, "N",
-        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sparams.xtc_threshold),
+        string_format("xtc threshold (default: %.1f, 1.0 = disabled)", (double)params.sampling.xtc_threshold),
         [](common_params & params, const std::string & value) {
-            params.sparams.xtc_threshold = std::stof(value);
+            params.sampling.xtc_threshold = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--typical"}, "N",
-        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sparams.typ_p),
+        string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
         [](common_params & params, const std::string & value) {
-            params.sparams.typ_p = std::stof(value);
+            params.sampling.typ_p = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-last-n"}, "N",
-        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sparams.penalty_last_n),
+        string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n),
         [](common_params & params, int value) {
-            params.sparams.penalty_last_n = value;
-            params.sparams.n_prev = std::max(params.sparams.n_prev, params.sparams.penalty_last_n);
+            params.sampling.penalty_last_n = value;
+            params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--repeat-penalty"}, "N",
-        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sparams.penalty_repeat),
+        string_format("penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)params.sampling.penalty_repeat),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_repeat = std::stof(value);
+            params.sampling.penalty_repeat = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--presence-penalty"}, "N",
-        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_present),
+        string_format("repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_present),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_present = std::stof(value);
+            params.sampling.penalty_present = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--frequency-penalty"}, "N",
-        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sparams.penalty_freq),
+        string_format("repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)params.sampling.penalty_freq),
         [](common_params & params, const std::string & value) {
-            params.sparams.penalty_freq = std::stof(value);
+            params.sampling.penalty_freq = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-multiplier"}, "N",
-        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sparams.dry_multiplier),
+        string_format("set DRY sampling multiplier (default: %.1f, 0.0 = disabled)", (double)params.sampling.dry_multiplier),
         [](common_params & params, const std::string & value) {
-            params.sparams.dry_multiplier = std::stof(value);
+            params.sampling.dry_multiplier = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-base"}, "N",
-        string_format("set DRY sampling base value (default: %.2f)", (double)params.sparams.dry_base),
+        string_format("set DRY sampling base value (default: %.2f)", (double)params.sampling.dry_base),
         [](common_params & params, const std::string & value) {
             float potential_base = std::stof(value);
             if (potential_base >= 1.0f)
             {
-                params.sparams.dry_base = potential_base;
+                params.sampling.dry_base = potential_base;
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-allowed-length"}, "N",
-        string_format("set allowed length for DRY sampling (default: %d)", params.sparams.dry_allowed_length),
+        string_format("set allowed length for DRY sampling (default: %d)", params.sampling.dry_allowed_length),
         [](common_params & params, int value) {
-            params.sparams.dry_allowed_length = value;
+            params.sampling.dry_allowed_length = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-penalty-last-n"}, "N",
-        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sparams.dry_penalty_last_n),
+        string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n),
         [](common_params & params, int value) {
-            params.sparams.dry_penalty_last_n = value;
+            params.sampling.dry_penalty_last_n = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dry-sequence-breaker"}, "STRING",
         string_format("add sequence breaker for DRY sampling, clearing out default breakers (%s) in the process; use \"none\" to not use any sequence breakers\n",
-            params.sparams.dry_sequence_breakers.empty() ? "none" :
-            std::accumulate(std::next(params.sparams.dry_sequence_breakers.begin()),
-                params.sparams.dry_sequence_breakers.end(),
-                std::string("'") + (params.sparams.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sparams.dry_sequence_breakers[0]) + "'",
+            params.sampling.dry_sequence_breakers.empty() ? "none" :
+            std::accumulate(std::next(params.sampling.dry_sequence_breakers.begin()),
+                params.sampling.dry_sequence_breakers.end(),
+                std::string("'") + (params.sampling.dry_sequence_breakers[0] == "\n" ? "\\n" : params.sampling.dry_sequence_breakers[0]) + "'",
                 [](const std::string& a, const std::string& b) {
                     std::string formatted_b = (b == "\n") ? "\\n" : b;
                     return a + ", '" + formatted_b + "'";
@@ -1040,51 +943,51 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             static bool defaults_cleared = false;

             if (!defaults_cleared) {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
                 defaults_cleared = true;
             }

             if (value == "none") {
-                params.sparams.dry_sequence_breakers.clear();
+                params.sampling.dry_sequence_breakers.clear();
             } else {
-                params.sparams.dry_sequence_breakers.emplace_back(value);
+                params.sampling.dry_sequence_breakers.emplace_back(value);
             }
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-range"}, "N",
-        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sparams.dynatemp_range),
+        string_format("dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)params.sampling.dynatemp_range),
         [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_range = std::stof(value);
+            params.sampling.dynatemp_range = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--dynatemp-exp"}, "N",
-        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sparams.dynatemp_exponent),
+        string_format("dynamic temperature exponent (default: %.1f)", (double)params.sampling.dynatemp_exponent),
         [](common_params & params, const std::string & value) {
-            params.sparams.dynatemp_exponent = std::stof(value);
+            params.sampling.dynatemp_exponent = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat"}, "N",
         string_format("use Mirostat sampling.\nTop K, Nucleus and Locally Typical samplers are ignored if used.\n"
-            "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sparams.mirostat),
+            "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", params.sampling.mirostat),
         [](common_params & params, int value) {
-            params.sparams.mirostat = value;
+            params.sampling.mirostat = value;
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-lr"}, "N",
-        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sparams.mirostat_eta),
+        string_format("Mirostat learning rate, parameter eta (default: %.1f)", (double)params.sampling.mirostat_eta),
         [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_eta = std::stof(value);
+            params.sampling.mirostat_eta = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
         {"--mirostat-ent"}, "N",
-        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sparams.mirostat_tau),
+        string_format("Mirostat target entropy, parameter tau (default: %.1f)", (double)params.sampling.mirostat_tau),
         [](common_params & params, const std::string & value) {
-            params.sparams.mirostat_tau = std::stof(value);
+            params.sampling.mirostat_tau = std::stof(value);
         }
     ).set_sparam());
     add_opt(common_arg(
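Note: since every sampling option now writes through params.sampling, code that used to set fields on params.sparams directly needs the same one-word rename. A small sketch with arbitrary values, based only on the fields visible in the hunks above:

    // Sketch: configure the renamed sampling sub-struct without going through the CLI.
    common_params params;
    params.sampling.seed           = 42;
    params.sampling.temp           = 0.7f;
    params.sampling.top_k          = 40;
    params.sampling.top_p          = 0.95f;
    params.sampling.penalty_repeat = 1.1f;
    params.sampling.dry_multiplier = 0.8f; // 0.0 = DRY disabled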
@@ -1100,7 +1003,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             try {
                 if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
                     const float bias = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
-                    params.sparams.logit_bias.push_back({key, bias});
+                    params.sampling.logit_bias.push_back({key, bias});
                 } else {
                     throw std::invalid_argument("invalid input format");
                 }
@@ -1111,9 +1014,9 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
     ).set_sparam());
     add_opt(common_arg(
         {"--grammar"}, "GRAMMAR",
-        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sparams.grammar.c_str()),
+        string_format("BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", params.sampling.grammar.c_str()),
         [](common_params & params, const std::string & value) {
-            params.sparams.grammar = value;
+            params.sampling.grammar = value;
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1127,7 +1030,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             std::copy(
                 std::istreambuf_iterator<char>(file),
                 std::istreambuf_iterator<char>(),
-                std::back_inserter(params.sparams.grammar)
+                std::back_inserter(params.sampling.grammar)
             );
         }
     ).set_sparam());
@@ -1135,7 +1038,7 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         {"-j", "--json-schema"}, "SCHEMA",
         "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\nFor schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead",
         [](common_params & params, const std::string & value) {
-            params.sparams.grammar = json_schema_to_grammar(json::parse(value));
+            params.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
     add_opt(common_arg(
@@ -1433,6 +1336,30 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             else { throw std::invalid_argument("invalid value"); }
         }
     ).set_env("LLAMA_ARG_NUMA"));
+    add_opt(common_arg(
+        {"-dev", "--device"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.devices = parse_device_list(value);
+        }
+    ).set_env("LLAMA_ARG_DEVICE"));
+    add_opt(common_arg(
+        {"--list-devices"},
+        "print list of available devices and exit",
+        [](common_params &) {
+            printf("Available devices:\n");
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
+                    size_t free, total;
+                    ggml_backend_dev_memory(dev, &free, &total);
+                    printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+                }
+            }
+            exit(0);
+        }
+    ));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
@@ -1444,17 +1371,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             }
         }
     ).set_env("LLAMA_ARG_N_GPU_LAYERS"));
-    add_opt(common_arg(
-        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](common_params & params, int value) {
-            params.n_gpu_layers_draft = value;
-            if (!llama_supports_gpu_offload()) {
-                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
-                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-            }
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-sm", "--split-mode"}, "{none,layer,row}",
         "how to split the model across multiple GPUs, one of:\n"
@@ -1468,10 +1384,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             } else if (arg_next == "layer") {
                 params.split_mode = LLAMA_SPLIT_MODE_LAYER;
             } else if (arg_next == "row") {
-#ifdef GGML_USE_SYCL
-                fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n");
-                exit(1);
-#endif // GGML_USE_SYCL
                 params.split_mode = LLAMA_SPLIT_MODE_ROW;
             } else {
                 throw std::invalid_argument("invalid value");
@@ -1593,13 +1505,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             params.model = value;
         }
     ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL"));
-    add_opt(common_arg(
-        {"-md", "--model-draft"}, "FNAME",
-        "draft model for speculative decoding (default: unused)",
-        [](common_params & params, const std::string & value) {
-            params.model_draft = value;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
     add_opt(common_arg(
         {"-mu", "--model-url"}, "MODEL_URL",
         "model download url (default: unused)",
@@ -1939,17 +1844,6 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             params.simple_io = true;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
-    add_opt(common_arg(
-        {"-ld", "--logdir"}, "LOGDIR",
-        "path under which to save YAML logs (no logging if unset)",
-        [](common_params & params, const std::string & value) {
-            params.logdir = value;
-
-            if (params.logdir.back() != DIRECTORY_SEPARATOR) {
-                params.logdir += DIRECTORY_SEPARATOR;
-            }
-        }
-    ));
     add_opt(common_arg(
         {"--positive-file"}, "FNAME",
         string_format("positive prompts file, one prompt per line (default: '%s')", params.cvector_positive_file.c_str()),
@@ -2048,5 +1942,176 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
         }
     ).set_env("LLAMA_LOG_TIMESTAMPS"));

+    // speculative parameters
+    add_opt(common_arg(
+        {"-td", "--threads-draft"}, "N",
+        "number of threads to use during generation (default: same as --threads)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.n_threads = value;
+            if (params.speculative.cpuparams.n_threads <= 0) {
+                params.speculative.cpuparams.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-tbd", "--threads-batch-draft"}, "N",
+        "number of threads to use during batch and prompt processing (default: same as --threads-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.n_threads = value;
+            if (params.speculative.cpuparams_batch.n_threads <= 0) {
+                params.speculative.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cd", "--cpu-mask-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crd", "--cpu-range-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams.cpumask)) {
+                throw std::invalid_argument("invalid range");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: same as --cpu-strict)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: same as --poll])",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Cbd", "--cpu-mask-batch-draft"}, "M",
+        "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)",
+        [](common_params & params, const std::string & mask) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_mask(mask, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"-Crbd", "--cpu-range-batch-draft"}, "lo-hi",
+        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)",
+        [](common_params & params, const std::string & range) {
+            params.speculative.cpuparams_batch.mask_valid = true;
+            if (!parse_cpu_range(range, params.speculative.cpuparams_batch.cpumask)) {
+                throw std::invalid_argument("invalid cpumask");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--cpu-strict-batch-draft"}, "<0|1>",
+        "Use strict CPU placement for draft model (default: --cpu-strict-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.strict_cpu = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--prio-batch-draft"}, "N",
+        string_format("set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.speculative.cpuparams_batch.priority),
+        [](common_params & params, int prio) {
+            if (prio < 0 || prio > 3) {
+                throw std::invalid_argument("invalid value");
+            }
+            params.speculative.cpuparams_batch.priority = (enum ggml_sched_priority) prio;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--poll-batch-draft"}, "<0|1>",
+        "Use polling to wait for draft model work (default: --poll-draft)",
+        [](common_params & params, int value) {
+            params.speculative.cpuparams_batch.poll = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-max", "--draft", "--draft-n"}, "N",
+        string_format("number of tokens to draft for speculative decoding (default: %d)", params.speculative.n_max),
+        [](common_params & params, int value) {
+            params.speculative.n_max = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-min", "--draft-n-min"}, "N",
+        string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
+        [](common_params & params, int value) {
+            params.speculative.n_min = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"--draft-p-split"}, "P",
+        string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_split = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    add_opt(common_arg(
+        {"--draft-p-min"}, "P",
+        string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
+        [](common_params & params, const std::string & value) {
+            params.speculative.p_min = std::stof(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-cd", "--ctx-size-draft"}, "N",
+        string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
+        [](common_params & params, int value) {
+            params.speculative.n_ctx = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-devd", "--device-draft"}, "<dev1,dev2,..>",
+        "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
+        "use --list-devices to see a list of available devices",
+        [](common_params & params, const std::string & value) {
+            params.speculative.devices = parse_device_list(value);
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
+        "number of layers to store in VRAM for the draft model",
+        [](common_params & params, int value) {
+            params.speculative.n_gpu_layers = value;
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    add_opt(common_arg(
+        {"-md", "--model-draft"}, "FNAME",
+        "draft model for speculative decoding (default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.speculative.model = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }
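Note: all draft-model settings now live in a dedicated params.speculative block rather than in loose draft_* fields. A hedged sketch of how an example might read them back after parsing; the struct name common_params_speculative and the logging wiring are assumptions for illustration, only the fields come from the hunk above:

    // Sketch: reading the consolidated speculative-decoding parameters.
    const common_params_speculative & spec = params.speculative;

    if (!spec.model.empty()) {
        LOG_INF("draft model: %s\n", spec.model.c_str());
        LOG_INF("draft tokens: max %d, min %d, p_min %.2f\n",
                spec.n_max, spec.n_min, (double) spec.p_min);
        // spec.devices, spec.n_gpu_layers and spec.cpuparams mirror the main-model fields
    }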
common/common.cpp
@@ -536,12 +536,12 @@ std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
                 [](const unsigned char c) { return !std::isprint(c); }),
             detokenized.end());

         buf << "\n" << std::to_string(i)
-            << ":token '" << detokenized << "'"
-            << ":pos " << std::to_string(batch.pos[i])
-            << ":n_seq_id " << std::to_string(batch.n_seq_id[i])
-            << ":seq_id " << std::to_string(batch.seq_id[i][0])
-            << ":logits " << std::to_string(batch.logits[i]);
+            << ", token '" << detokenized << "'"
+            << ", pos " << std::to_string(batch.pos[i])
+            << ", n_seq_id " << std::to_string(batch.n_seq_id[i])
+            << ", seq_id " << std::to_string(batch.seq_id[i][0])
+            << ", logits " << std::to_string(batch.logits[i]);
     }

     buf << " ]";
@@ -875,6 +875,12 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

+    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+        LOG_ERR("%s: KV cache shifting is not supported for this model (--no-context-shift to disable)'\n", __func__);
+        llama_free_model(model);
+        return iparams;
+    }
+
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
         if (params.control_vector_layer_end   <= 0) params.control_vector_layer_end   = llama_n_layer(model);
@@ -919,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }

-    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sparams.ignore_eos = false;
+        params.sampling.ignore_eos = false;
     }

     if (params.warmup) {
@@ -973,9 +979,12 @@ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_l
     }
 }

-struct llama_model_params common_model_params_to_llama(const common_params & params) {
+struct llama_model_params common_model_params_to_llama(common_params & params) {
     auto mparams = llama_model_default_params();

+    if (!params.devices.empty()) {
+        mparams.devices = params.devices.data();
+    }
     if (params.n_gpu_layers != -1) {
         mparams.n_gpu_layers = params.n_gpu_layers;
     }
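Note: with the new devices field, the list parsed from --device flows straight into llama_model_params. A minimal sketch of that path; the model path string is a placeholder and parse_device_list() is assumed reachable from this code:

    // Sketch: hand a parsed device list to the model loader.
    common_params params;
    params.devices = parse_device_list("none"); // offload nothing in this example

    llama_model_params mparams = common_model_params_to_llama(params);
    // mparams.devices now points at the nullptr-terminated list built above
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path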
@ -1484,6 +1493,66 @@ void common_batch_add(
|
|||||||
batch.n_tokens++;
|
batch.n_tokens++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Token utils
|
||||||
|
//
|
||||||
|
|
||||||
|
size_t common_lcp(const llama_tokens & a, const llama_tokens & b) {
|
||||||
|
size_t i;
|
||||||
|
for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
|
||||||
|
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t common_lcs(const llama_tokens & a, const llama_tokens & b) {
|
||||||
|
// check for empty sequences
|
||||||
|
if (a.empty() || b.empty()) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the lengths of the input sequences
|
||||||
|
size_t a_len = a.size();
|
||||||
|
size_t b_len = b.size();
|
||||||
|
|
||||||
|
// initialize the maximum length of the longest common subsequence (LCS)
|
||||||
|
size_t max_length = 0;
|
||||||
|
|
||||||
|
// use two rows instead of a 2D matrix to optimize space
|
||||||
|
std::vector<size_t> prev_row(b_len + 1, 0);
|
||||||
|
std::vector<size_t> curr_row(b_len + 1, 0);
|
||||||
|
|
||||||
|
// iterate through the elements of a
|
||||||
|
for (size_t i = 1; i <= a_len; i++) {
|
||||||
|
// iterate through the elements of b
|
||||||
|
for (size_t j = 1; j <= b_len; j++) {
|
||||||
|
// if elements at the current positions match
|
||||||
|
if (a[i - 1] == b[j - 1]) {
|
||||||
|
// if it's the first element of either sequences, set LCS length to 1
|
||||||
|
if (i == 1 || j == 1) {
|
||||||
|
curr_row[j] = 1;
|
||||||
|
} else {
|
||||||
|
// increment LCS length by 1 compared to the previous element
|
||||||
|
curr_row[j] = prev_row[j - 1] + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// update max_length if necessary
|
||||||
|
if (curr_row[j] > max_length) {
|
||||||
|
max_length = curr_row[j];
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// reset LCS length if elements don't match
|
||||||
|
curr_row[j] = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update the previous row for the next iteration
|
||||||
|
prev_row = curr_row;
|
||||||
|
}
|
||||||
|
|
||||||
|
// return the maximum length of the LCS
|
||||||
|
return max_length;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Vocab utils
|
// Vocab utils
|
||||||
//
|
//
|
||||||
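As a rough illustration of how these new token utilities might be used (a minimal sketch, not part of the commit; the caller, variable names, and context are assumptions), `common_lcp` gives the number of leading tokens two sequences share, which is useful for deciding how much of an existing KV cache can be kept, while `common_lcs` reports the length of the longest matching run. Note that, as the implementation above shows (it resets the counter on a mismatch), the helper measures the longest contiguous matching run rather than a general subsequence.

```cpp
// sketch: compare a cached prompt with a new request
// assumes common.h (which declares llama_tokens, common_lcp, common_lcs) is available
#include "common.h"

#include <cstdio>

void report_prompt_reuse(const llama_tokens & cached_prompt, const llama_tokens & new_prompt) {
    const size_t n_keep  = common_lcp(cached_prompt, new_prompt); // tokens reusable from the start
    const size_t n_match = common_lcs(cached_prompt, new_prompt); // longest contiguous shared run

    printf("can reuse %zu prefix tokens, longest shared run is %zu tokens\n", n_keep, n_match);
}
```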
@@ -1890,218 +1959,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
     return result;
 }

-//
-// YAML utils
-//
-
-void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%e, ", data[i]);
-    }
-    fprintf(stream, "%e]\n", data.back());
-}
-
-void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
-    if (data.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    fprintf(stream, "%s: [", prop_name);
-    for (size_t i = 0; i < data.size() - 1; ++i) {
-        fprintf(stream, "%d, ", data[i]);
-    }
-    fprintf(stream, "%d]\n", data.back());
-}
-
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
-    std::string data_str(data == NULL ? "" : data);
-
-    if (data_str.empty()) {
-        fprintf(stream, "%s:\n", prop_name);
-        return;
-    }
-
-    size_t pos_start = 0;
-    size_t pos_found = 0;
-
-    if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
-        data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
-        data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
-        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
-        data_str = "\"" + data_str + "\"";
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    if (data_str.find('\n') == std::string::npos) {
-        fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
-        return;
-    }
-
-    fprintf(stream, "%s: |\n", prop_name);
-    while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
-        fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
-        pos_start = pos_found + 1;
-    }
-}
-
-void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
-                               const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
-    ggml_cpu_init(); // some ARM features are detected at runtime
-
-    const auto & sparams = params.sparams;
-
-    fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
-    fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
-    fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
-    fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
-    fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
-    fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
-    fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
-    fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
-    fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
-    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
-    fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
-    fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
-    fprintf(stream, "cpu_has_riscv_v: %s\n", ggml_cpu_has_riscv_v() ? "true" : "false");
-    fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
-    fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
-    fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
-    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
-    fprintf(stream, "debug: false\n");
-#else
-    fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
-    fprintf(stream, "model_desc: %s\n", model_desc);
-    fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
-    fprintf(stream, "optimize: true\n");
-#else
-    fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
-    fprintf(stream, "time: %s\n", timestamp.c_str());
-
-    fprintf(stream, "\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "# User Inputs #\n");
-    fprintf(stream, "###############\n");
-    fprintf(stream, "\n");
-
-    fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
-    fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
-    fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
-    fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
-    fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
-    fprintf(stream, "dry_allowed_length: %d # default: 2\n", sparams.dry_allowed_length);
-    fprintf(stream, "dry_base: %.2f # default: 1.75\n", sparams.dry_base);
-    fprintf(stream, "dry_multiplier: %.1f # default: 0.0\n", sparams.dry_multiplier);
-    fprintf(stream, "dry_penalty_last_n: %d # default: -1 (0 = disable, -1 = context size)\n", sparams.dry_penalty_last_n);
-    fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
-    fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
-    fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
-    yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
-    fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
-    fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-    fprintf(stream, "ignore_eos: %s # default: false\n", sparams.ignore_eos ? "true" : "false");
-
-    yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
-    fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
-    yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
-    fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
-    fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
-    fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
-    fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
-    fprintf(stream, "logit_bias:\n");
-    for (const auto & logit_bias : sparams.logit_bias) {
-        fprintf(stream, " %d: %f", logit_bias.token, logit_bias.bias);
-    }
-
-    fprintf(stream, "lora:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale == 1.0f) {
-            fprintf(stream, " - %s\n", la.path.c_str());
-        }
-    }
-    fprintf(stream, "lora_scaled:\n");
-    for (auto & la : params.lora_adapters) {
-        if (la.scale != 1.0f) {
-            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
-        }
-    }
-    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
-    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-    fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
-    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
-    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
-    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
-    fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
-    fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
-    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
-    fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
-    fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
-    fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
-    fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
-    fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
-    fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
-    fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
-    fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
-    fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
-    yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
-    fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
-    fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
-    fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
-    yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
-    fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
-    fprintf(stream, "reverse_prompt:\n");
-    for (std::string ap : params.antiprompt) {
-        size_t pos = 0;
-        while ((pos = ap.find('\n', pos)) != std::string::npos) {
-            ap.replace(pos, 1, "\\n");
-            pos += 1;
-        }
-
-        fprintf(stream, " - %s\n", ap.c_str());
-    }
-
-    fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
-    fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
-    fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
-    fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
-    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
-    fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
-    yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
-
-    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
-    fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
-    fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
-    fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
-    fprintf(stream, "xtc_probability: %f # default: 0.0\n", sparams.xtc_probability);
-    fprintf(stream, "xtc_threshold: %f # default: 0.1\n", sparams.xtc_threshold);
-    fprintf(stream, "typ_p: %f # default: 1.0\n", sparams.typ_p);
-    fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
-    fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
-}
@@ -33,6 +33,8 @@ struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };

+using llama_tokens = std::vector<llama_token>;
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;

@@ -101,8 +103,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };

-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

     int32_t n_prev = 64; // number of previous tokens to remember

@@ -153,21 +155,30 @@ struct common_params_sampling {
     std::string print() const;
 };

+struct common_params_speculative {
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_ctx = 0; // draft context size
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    float p_split = 0.1f; // speculative decoding split probability
+    float p_min = 0.9f; // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 4096; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1; // number of parallel sequences to decode
     int32_t n_sequences = 1; // number of sequences to decode
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1; // group-attention factor
     int32_t grp_attn_w = 512; // group-attention width
     int32_t n_print = -1; // print token count every n tokens (-1 = disabled)

@@ -178,27 +189,31 @@ struct common_params {
     float yarn_beta_fast = 32.0f; // YaRN low correction dim
     float yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx = 0; // YaRN original context length
-    float defrag_thold = -1.0f; // KV cache defragmentation threshold
+    float defrag_thold = 0.1f; // KV cache defragmentation threshold
+
+    // offload params
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs

     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;

     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;

     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;

-    enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
-
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;

     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT

@@ -209,7 +224,6 @@ struct common_params {
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
     std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
     std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
-    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
     std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
     std::string logits_file = ""; // file for saving *all* logits // NOLINT

@@ -452,7 +466,7 @@ struct common_init_result {

 struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params common_model_params_to_llama (const common_params & params);
+struct llama_model_params common_model_params_to_llama ( common_params & params);
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

@@ -462,7 +476,9 @@ struct llama_model * common_load_model_from_hf(const char * repo, const char * f
 // clear LoRA adapters from context, then apply new list of adapters
 void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

+//
 // Batch utils
+//

 void common_batch_clear(struct llama_batch & batch);

@@ -473,6 +489,16 @@ void common_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
     bool logits);

+//
+// Token utils
+//
+
+// longest common prefix
+size_t common_lcp(const llama_tokens & a, const llama_tokens & b);
+
+// longet common subsequence
+size_t common_lcs(const llama_tokens & a, const llama_tokens & b);
+
 //
 // Vocab utils
 //

@@ -584,15 +610,3 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 static const char * const LLM_KV_SPLIT_NO = "split.no";
 static const char * const LLM_KV_SPLIT_COUNT = "split.count";
 static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
-
-//
-// YAML utils
-//
-
-void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
-void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
-void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
-
-void yaml_dump_non_result_info(
-    FILE * stream, const common_params & params, const llama_context * lctx,
-    const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
@@ -99,7 +99,7 @@ struct ring_buffer {
 };

 struct common_sampler {
-    common_sampler_params params;
+    common_params_sampling params;

     struct llama_sampler * grmr;
     struct llama_sampler * chain;

@@ -125,7 +125,7 @@ struct common_sampler {
     }
 };

-std::string common_sampler_params::print() const {
+std::string common_params_sampling::print() const {
     char result[1024];

     snprintf(result, sizeof(result),

@@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
     return std::string(result);
 }

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();

     lparams.no_perf = params.no_perf;

@@ -320,6 +320,45 @@ llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_co
     return cur_p.data[cur_p.selected].id;
 }

+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first) {
+    GGML_ASSERT(idxs.size() == draft.size() + 1 && "idxs.size() must be draft.size() + 1");
+
+    std::vector<llama_token> result;
+    result.reserve(idxs.size());
+
+    size_t i = 0;
+    for (; i < draft.size(); i++) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+
+        if (draft[i] != id) {
+            break;
+        }
+    }
+
+    if (i == draft.size()) {
+        const llama_token id = common_sampler_sample(gsmpl, ctx, idxs[i], grammar_first);
+
+        common_sampler_accept(gsmpl, id, true);
+
+        result.push_back(id);
+    }
+
+    return result;
+}
+
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first) {
+    std::vector<int> idxs(draft.size() + 1);
+    for (size_t i = 0; i < idxs.size(); ++i) {
+        idxs[i] = i;
+    }
+
+    return common_sampler_sample_and_accept_n(gsmpl, ctx, idxs, draft, grammar_first);
+}
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
     return llama_sampler_get_seed(gsmpl->chain);
 }

@@ -36,7 +36,7 @@ struct common_sampler;

 // llama_sampler API overloads

-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);

 void common_sampler_free(struct common_sampler * gsmpl);

@@ -60,6 +60,27 @@ void common_perf_print(const struct llama_context * ctx, const struct common_sam
 //
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);

+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+
 uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);

 // helpers
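To make the comment block above concrete, here is a minimal sketch of how the overload that assumes `idxs == [ 0, 1, ..., draft.size() ]` might be called. This is not part of the commit; the caller, the variable names, and the assumption that the target context has just decoded a batch whose output indices cover the drafted positions are all illustrative.

```cpp
// sketch: verify a draft against the target model
// assumes ctx_tgt has just decoded a batch whose output indices 0..draft.size()
// hold the logits for the drafted positions
void check_draft(common_sampler * smpl, llama_context * ctx_tgt, const llama_tokens & draft) {
    // samples one token per output index, accepts it into the sampler state, and stops
    // at the first position where the sampled token disagrees with the draft
    const std::vector<llama_token> ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);

    // at least one token comes back; everything except (possibly) the last one matched the draft
    const size_t n_accepted = ids.size() - 1;

    LOG_DBG("accepted %zu of %zu drafted tokens\n", n_accepted, draft.size());
}
```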
common/speculative.cpp (new file, 270 lines)
@@ -0,0 +1,270 @@
+#include "speculative.h"
+
+#include "log.h"
+#include "common.h"
+#include "sampling.h"
+
+#include <cstring>
+
+#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  128
+#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
+
+struct common_speculative {
+    struct llama_context * ctx;
+    struct common_sampler * smpl;
+
+    llama_batch batch;
+    llama_tokens prompt;
+};
+
+struct common_speculative * common_speculative_init(
+        struct llama_context * ctx_dft) {
+    auto * result = new common_speculative {
+        /* .ctx    = */ ctx_dft,
+        /* .smpl   = */ nullptr,
+        /* .batch  = */ llama_batch_init(llama_n_batch(ctx_dft), 0, 1),
+        /* .prompt = */ {},
+    };
+
+    // TODO: optimize or pass from outside?
+#if 0
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 40;
+        params.top_p = 0.9;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+            COMMON_SAMPLER_TYPE_TOP_P,
+            COMMON_SAMPLER_TYPE_INFILL,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#else
+    {
+        common_params_sampling params;
+        params.no_perf = false;
+
+        params.top_k = 10;
+
+        params.samplers = {
+            COMMON_SAMPLER_TYPE_TOP_K,
+        };
+
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
+    }
+#endif
+
+    return result;
+}
+
+void common_speculative_free(struct common_speculative * spec) {
+    common_sampler_free(spec->smpl);
+
+    llama_batch_free(spec->batch);
+
+    delete spec;
+}
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft) {
+    const struct llama_model * model_tgt = llama_get_model(ctx_tgt);
+    const struct llama_model * model_dft = llama_get_model(ctx_dft);
+
+    const bool vocab_type_tgt = llama_vocab_type(model_tgt);
+    LOG_DBG("%s: vocab_type tgt: %d\n", __func__, vocab_type_tgt);
+
+    const bool vocab_type_dft = llama_vocab_type(model_dft);
+    LOG_DBG("%s: vocab_type dft: %d\n", __func__, vocab_type_dft);
+
+    if (vocab_type_tgt != vocab_type_dft) {
+        LOG_ERR("%s: draft model vocab type must match target model to use speculation but "
+                "vocab_type_dft = %d while vocab_type_tgt = %d\n", __func__, vocab_type_dft, vocab_type_tgt);
+        return false;
+    }
+
+    if (llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) ||
+        llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) ||
+        llama_token_bos(model_tgt) != llama_token_bos(model_dft) ||
+        llama_token_eos(model_tgt) != llama_token_eos(model_dft)) {
+        LOG_ERR("%s: draft model special tokens must match target model to use speculation\n", __func__);
+        LOG_ERR("%s: tgt: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_tgt), llama_add_bos_token(model_tgt), llama_token_eos(model_tgt), llama_add_eos_token(model_tgt));
+        LOG_ERR("%s: dft: bos = %d (%d), eos = %d (%d)\n", __func__, llama_token_bos(model_dft), llama_add_bos_token(model_dft), llama_token_eos(model_dft), llama_add_eos_token(model_dft));
+        return false;
+    }
+
+    {
+        const int n_vocab_tgt = llama_n_vocab(model_tgt);
+        const int n_vocab_dft = llama_n_vocab(model_dft);
+
+        const int vocab_diff = std::abs(n_vocab_tgt - n_vocab_dft);
+
+        if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
+            LOG_ERR("%s: draft model vocab must closely match target model to use speculation but "
+                    "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
+                    __func__, n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
+            return false;
+        }
+
+        for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
+            const char * token_text_tgt = llama_token_get_text(model_tgt, i);
+            const char * token_text_dft = llama_token_get_text(model_dft, i);
+            if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
+                LOG_ERR("%s: draft model vocab must match target model to use speculation but "
+                        "token %d content differs - target '%s', draft '%s'\n", __func__, i,
+                        common_token_to_piece(ctx_tgt, i).c_str(),
+                        common_token_to_piece(ctx_dft, i).c_str());
+                return false;
+            }
+        }
+    }
+
+    return true;
+}
+
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt_tgt,
+        llama_token id_last) {
+    auto & batch  = spec->batch;
+    auto & ctx    = spec->ctx;
+    auto & smpl   = spec->smpl;
+    auto & prompt = spec->prompt;
+
+    int reuse_i = 0;
+    int reuse_n = 0;
+
+    const int n_ctx = llama_n_ctx(ctx) - params.n_draft;
+
+    const int i_start = std::max<int>(0, (int) prompt_tgt.size() - n_ctx);
+
+    // reuse as much as possible from the old draft context
+    // ideally, the draft context should be as big as the target context and we will always reuse the entire prompt
+    for (int i = 0; i < (int) prompt.size(); ++i) {
+        int cur = 0;
+        while (i_start + cur < (int) prompt_tgt.size() &&
+               i       + cur < (int) prompt.size() &&
+               prompt_tgt[i_start + cur] == prompt[i + cur]) {
+            cur++;
+        }
+
+        if ((cur >= params.n_reuse || n_ctx >= (int) prompt_tgt.size()) && cur > reuse_n) {
+            reuse_i = i;
+            reuse_n = cur;
+        }
+    }
+
+    LOG_DBG("%s: reuse_i = %d, reuse_n = %d, prompt = %d\n", __func__, reuse_i, reuse_n, (int) prompt.size());
+
+    llama_tokens result;
+    result.reserve(params.n_draft);
+
+    if (reuse_n == 0) {
+        llama_kv_cache_clear(ctx);
+
+        prompt.clear();
+    } else {
+        // this happens when a previous draft has been discarded (for example, due to being too small), but the
+        // target model agreed with it. in this case, we simply pass back the previous results to save compute
+        if (reuse_i + reuse_n < (int) prompt.size() && prompt[reuse_i + reuse_n] == id_last) {
+            for (int i = reuse_i + reuse_n + 1; i < (int) prompt.size(); ++i) {
+                result.push_back(prompt[i]);
+
+                if (params.n_draft <= (int) result.size()) {
+                    break;
+                }
+            }
+
+            return result;
+        }
+
+        if (reuse_i > 0) {
+            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+
+            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
+        }
+
+        if (reuse_n < (int) prompt.size()) {
+            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+
+            prompt.erase(prompt.begin() + reuse_n, prompt.end());
+        }
+    }
+
+    // prepare a batch to evaluate any new tokens in the prompt
+    common_batch_clear(batch);
+
+    for (size_t i = i_start + reuse_n; i < prompt_tgt.size(); ++i) {
+        //LOG_DBG("i = %d, i_start = %d, reuse_n = %d, i - i_start = %d, id = %6d\n", i, i_start, reuse_n, i - i_start, prompt_tgt[i]);
+        common_batch_add(batch, prompt_tgt[i], i - i_start, { 0 }, false);
+
+        prompt.push_back(prompt_tgt[i]);
+    }
+
+    // we should rarely end-up here during normal decoding
+    if (batch.n_tokens > 0) {
+        //LOG_DBG("%s: draft prompt batch: %s\n", __func__, string_from(ctx, batch).c_str());
+
+        llama_decode(ctx, batch);
+    }
+
+    const llama_pos n_past = prompt.size();
+
+    LOG_DBG("%s: n_past = %d\n", __func__, n_past);
+
+    common_batch_clear(batch);
+    common_batch_add (batch, id_last, n_past, { 0 }, true);
+
+    prompt.push_back(id_last);
+
+    //LOG_DBG("%s: draft prompt: %s\n", __func__, string_from(ctx, prompt).c_str());
+
+    llama_decode(ctx, batch);
+
+    common_sampler_reset(smpl);
+
+    // sample n_draft tokens from the draft model
+    for (int i = 0; i < params.n_draft; ++i) {
+        common_batch_clear(batch);
+
+        common_sampler_sample(smpl, ctx, 0, true);
+
+        const auto * cur_p = common_sampler_get_candidates(smpl);
+
+        for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) {
+            LOG_DBG(" - draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n",
+                    k, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx, cur_p->data[k].id).c_str());
+        }
+
+        // add drafted token for each sequence
+        const llama_token id = cur_p->data[0].id;
+
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
+        common_sampler_accept(smpl, id, true);
+
+        result.push_back(id);
+
+        if (params.n_draft <= (int) result.size()) {
+            break;
+        }
+
+        common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
+
+        // evaluate the drafted tokens on the draft model
+        llama_decode(ctx, batch);
+
+        prompt.push_back(id);
+    }
+
+    return result;
+}
common/speculative.h (new file, 28 lines)
@@ -0,0 +1,28 @@
+#pragma once
+
+#include "llama.h"
+#include "common.h"
+
+struct common_speculative;
+
+struct common_speculative_params {
+    int n_draft = 16; // max drafted tokens
+    int n_reuse = 256;
+
+    float p_min = 0.9f; // min probabiliy required to accept a token in the draft
+};
+
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+
+void common_speculative_free(struct common_speculative * spec);
+
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+        struct common_speculative * spec,
+        struct common_speculative_params params,
+        const llama_tokens & prompt,
+        llama_token id_last);
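Taken together with the sampler helper above, the intended flow is roughly the following. This is a hedged sketch only, not code from the commit: the contexts, sampler, and `speculative_step` wrapper are assumptions, and the target-model decode of `id_last` plus the draft (step 2) is omitted.

```cpp
// sketch: one step of speculative decoding with the new common_speculative API
// spec, smpl, and ctx_tgt are assumed to be created and validated elsewhere
// (e.g. with common_speculative_init and common_speculative_are_compatible)
llama_token speculative_step(common_speculative * spec,
                             common_sampler * smpl,
                             llama_context * ctx_tgt,
                             llama_tokens & prompt_tgt,
                             llama_token id_last) {
    common_speculative_params params;
    params.n_draft = 16;   // ask for at most 16 draft tokens
    params.p_min   = 0.9f; // only keep high-confidence draft tokens

    // 1. let the draft model propose a continuation after id_last
    const llama_tokens draft = common_speculative_gen_draft(spec, params, prompt_tgt, id_last);

    // 2. (omitted) decode id_last followed by the draft on the target model in a single batch

    // 3. sample from the target at every drafted position and keep the agreeing prefix
    const std::vector<llama_token> ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);

    // the accepted tokens extend the target prompt; the last sampled token becomes the next id_last
    prompt_tgt.push_back(id_last);
    prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);

    return ids.back();
}
```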
@@ -2707,7 +2707,7 @@ class XLMRobertaModel(BertModel):
         self.gguf_writer.add_token_scores(scores)
         self.gguf_writer.add_token_types(toktypes)
         self.gguf_writer.add_add_space_prefix(add_prefix)
-        self.gguf_writer.add_token_type_count(1)
+        self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
         self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
         if precompiled_charsmap:
             self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)

@@ -3040,6 +3040,11 @@ class OlmoModel(Model):
         return [(self.map_tensor_name(name), data_torch)]


+@Model.register("Olmo2ForCausalLM")
+class Olmo2Model(Model):
+    model_arch = gguf.MODEL_ARCH.OLMO2
+
+
 @Model.register("OlmoeForCausalLM")
 class OlmoeModel(Model):
     model_arch = gguf.MODEL_ARCH.OLMOE
@@ -34,13 +34,16 @@ The SYCL backend would be broken by some PRs due to no online CI.

 The following release is verified with good quality:

-|Commit ID|Tag|Release|Verified Platform|
-|-|-|-|-|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1|
+|Commit ID|Tag|Release|Verified Platform| Update date|
+|-|-|-|-|-|
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||

 ## News

+- 2024.11
+  - Use syclcompat to improve the performance on some platforms. This requires to use oneAPI 2025.0 or newer.
+
 - 2024.8
   - Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.

@@ -310,12 +313,14 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_
 export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR

 # Build LLAMA with Nvidia BLAS acceleration through SYCL
+# Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance
+GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture

 # Option 1: Use FP32 (recommended for better performance in most cases)
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # Option 2: Use FP16
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON

 # build all binary
 cmake --build build --config Release -j -v

@@ -333,8 +338,9 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE

 ## AMD
 # Use FP32, FP16 is not supported
-# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:'
-cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+# Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:'
+GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 # build all binary
 cmake --build build --config Release -j -v

@@ -644,6 +650,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
 |--------------------|---------------------------------------|---------------------------------------------|
 | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
 | GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
+| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
 | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
 | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
@ -186,13 +186,9 @@ The following compilation options are also available to tweak performance:
|
|||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
|
||||||
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
|
||||||
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
|
||||||
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||||
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
||||||
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
|
||||||
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||||
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
||||||
|
|
||||||
@ -230,7 +226,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm

- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):

```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```

On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
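For example, the same configure step with UMA enabled would look roughly like this (a sketch of the command above with the extra flag added):

```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_UMA=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```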
@ -247,7 +243,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm

```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
    HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build -- -j 16
```
@ -259,7 +255,7 @@ You can download it from your Linux distro's package manager or from here: [ROCm

- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):

```bash
set PATH=%HIP_PATH%\bin;%PATH%
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIP=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cmake --build build
```

Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100`, which corresponds to the Radeon RX 7900 XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors).
@ -268,13 +264,6 @@ You can download it from your Linux distro's package manager or from here: [ROCm

The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.

If your GPU is not officially supported you can set the environment variable [`HSA_OVERRIDE_GFX_VERSION`] to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
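As a sketch of how these runtime variables combine (the model path and layer count here are placeholders, not taken from this document):

```bash
# select only the first GPU and report an officially supported RDNA2 arch to ROCm
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 \
    ./build/bin/llama-cli -m /path/to/model.gguf -ngl 99
```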
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
| Option                 | Legal values           | Default | Description |
|------------------------|------------------------|---------|-------------|
| GGML_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| GGML_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| GGML_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
### Vulkan

@ -282,9 +271,9 @@ The following compilation options are also available to tweak performance (yes,

#### w64devkit

Download and extract [`w64devkit`](https://github.com/skeeto/w64devkit/releases).

Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.

Launch `w64devkit.exe` and run the following commands to copy Vulkan dependencies:

```sh
@ -302,6 +291,29 @@ EOF
```

Switch into the `llama.cpp` directory and run `make GGML_VULKAN=1`.
#### Git Bash MINGW64

Download and install [`Git-SCM`](https://git-scm.com/downloads/win) with the default settings.

Download and install [`Visual Studio Community Edition`](https://visualstudio.microsoft.com/) and make sure you select `C++`.

Download and install [`CMake`](https://cmake.org/download/) with the default settings.

Download and install the [`Vulkan SDK`](https://vulkan.lunarg.com/sdk/home#windows) with the default settings.

Go into your `llama.cpp` directory, right click, select `Open Git Bash Here` and then run the following commands:

```
cmake -B build -DGGML_VULKAN=ON
cmake --build build --config Release
```

Now you can load the model in conversation mode using `Vulkan`:

```
build/bin/release/llama-cli -m "[PATH TO MODEL]" -ngl 100 -c 16384 -t 10 -n -2 -cnv
```
#### MSYS2

Install [MSYS2](https://www.msys2.org/) and then run the following commands in a UCRT terminal to install dependencies.

```sh
@ -375,7 +387,7 @@ cmake --build build --config release

You can test with:

`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`

If the following info is output on screen, you are using `llama.cpp` with the CANN backend:

```bash
@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})

if (EMSCRIPTEN)
else()
    add_subdirectory(batched-bench)
    add_subdirectory(batched)
    add_subdirectory(embedding)
    add_subdirectory(eval-callback)
    add_subdirectory(gbnf-validator)
    add_subdirectory(gguf-hash)
    add_subdirectory(gguf-split)

@ -27,28 +24,36 @@ else()

    add_subdirectory(imatrix)
    add_subdirectory(infill)
    add_subdirectory(llama-bench)
    add_subdirectory(lookahead)
    add_subdirectory(lookup)
    add_subdirectory(main)
    add_subdirectory(parallel)
    add_subdirectory(passkey)
    add_subdirectory(perplexity)
    add_subdirectory(quantize)
    add_subdirectory(retrieval)
    if (LLAMA_BUILD_SERVER)
        add_subdirectory(server)
    endif()
    add_subdirectory(save-load-state)
    add_subdirectory(run)
    add_subdirectory(simple)
    add_subdirectory(simple-chat)
    add_subdirectory(speculative)
    add_subdirectory(speculative-simple)
    add_subdirectory(tokenize)
    if (NOT GGML_BACKEND_DL)
        # these examples use the backends directly and cannot be built with dynamic loading
        add_subdirectory(convert-llama2c-to-ggml)
        add_subdirectory(cvector-generator)
        add_subdirectory(export-lora)
        add_subdirectory(quantize-stats)
        add_subdirectory(llava)
        if (GGML_RPC)
            add_subdirectory(rpc)
        endif()
        if (GGML_SYCL)
            add_subdirectory(sycl)
        endif()
    endif()
endif()
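With this layout, individual example binaries can still be built one at a time. A minimal sketch, assuming a default top-level configure that includes the examples (the target name `llama-run` comes from the new `examples/run/CMakeLists.txt` shown later in this diff):

```bash
cmake -B build
cmake --build build --target llama-run --config Release
```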
@ -68,10 +68,10 @@ int main(int argc, char ** argv) {

    llama_sampler * smpl = llama_sampler_chain_init(sparams);

    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));

    if (ctx == NULL) {
        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
|
@ -23,8 +23,9 @@ CUR_PROMPT_CACHE="${CHAT_SAVE_DIR}/current-cache.bin"
|
|||||||
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
|
NEXT_PROMPT_FILE="${CHAT_SAVE_DIR}/next-prompt.txt"
|
||||||
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
|
NEXT_PROMPT_CACHE="${CHAT_SAVE_DIR}/next-cache.bin"
|
||||||
|
|
||||||
SESSION_SIZE_MSG_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'
|
SESSION_AND_SAMPLE_PATTERN='main: session file matches [[:digit:]]+ / [[:digit:]]+'\
|
||||||
SAMPLE_TIME_MSG_PATTERN='sample time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
|
'|'\
|
||||||
|
'sampling time =[[:space:]]+[[:digit:]]+.[[:digit:]]+ ms /[[:space:]]+[[:digit:]]+'
|
||||||
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
|
SED_DELETE_MESSAGES="/^(${USER_NAME}:|${AI_NAME}:|\\.\\.\\.)/,\$d"
|
||||||
|
|
||||||
CTX_SIZE=2048
|
CTX_SIZE=2048
|
||||||
@ -129,15 +130,12 @@ while read -e line; do
|
|||||||
|
|
||||||
printf ' '
|
printf ' '
|
||||||
|
|
||||||
# HACK get num tokens from debug message
|
if ! session_and_sample_msg=$(tail -n30 "$LOG" | grep -oE "$SESSION_AND_SAMPLE_PATTERN"); then
|
||||||
# TODO get both messages in one go
|
|
||||||
if ! session_size_msg="$(tail -n30 "$LOG" | grep -oE "$SESSION_SIZE_MSG_PATTERN")" ||
|
|
||||||
! sample_time_msg="$(tail -n10 "$LOG" | grep -oE "$SAMPLE_TIME_MSG_PATTERN")"; then
|
|
||||||
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
echo >&2 "Couldn't get number of tokens from ./llama-cli output!"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
n_tokens=$(($(cut -d/ -f2 <<<"$session_size_msg") + $(cut -d/ -f2 <<<"$sample_time_msg")))
|
n_tokens=$(awk '{sum+=$1} END {print sum}' <<< "$(cut -d/ -f2 <<< "$session_and_sample_msg")")
|
||||||
|
|
||||||
if ((n_tokens > CTX_ROTATE_POINT)); then
|
if ((n_tokens > CTX_ROTATE_POINT)); then
|
||||||
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
|
tail -c+$((n_prompt_len_pre + 1)) "$CUR_PROMPT_FILE" >>"$NEXT_PROMPT_FILE"
|
||||||
|
@ -840,6 +840,8 @@ class OutputFile:
|
|||||||
self.gguf.add_base_model_version(key, base_model_entry["version"])
|
self.gguf.add_base_model_version(key, base_model_entry["version"])
|
||||||
if "organization" in base_model_entry:
|
if "organization" in base_model_entry:
|
||||||
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
|
self.gguf.add_base_model_organization(key, base_model_entry["organization"])
|
||||||
|
if "description" in base_model_entry:
|
||||||
|
self.gguf.add_base_model_description(key, base_model_entry["description"])
|
||||||
if "url" in base_model_entry:
|
if "url" in base_model_entry:
|
||||||
self.gguf.add_base_model_url(key, base_model_entry["url"])
|
self.gguf.add_base_model_url(key, base_model_entry["url"])
|
||||||
if "doi" in base_model_entry:
|
if "doi" in base_model_entry:
|
||||||
@ -849,12 +851,32 @@ class OutputFile:
|
|||||||
if "repo_url" in base_model_entry:
|
if "repo_url" in base_model_entry:
|
||||||
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
|
self.gguf.add_base_model_repo_url(key, base_model_entry["repo_url"])
|
||||||
|
|
||||||
|
if metadata.datasets is not None:
|
||||||
|
self.gguf.add_dataset_count(len(metadata.datasets))
|
||||||
|
for key, dataset_entry in enumerate(metadata.datasets):
|
||||||
|
if "name" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_name(key, dataset_entry["name"])
|
||||||
|
if "author" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_author(key, dataset_entry["author"])
|
||||||
|
if "version" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_version(key, dataset_entry["version"])
|
||||||
|
if "organization" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_organization(key, dataset_entry["organization"])
|
||||||
|
if "description" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_description(key, dataset_entry["description"])
|
||||||
|
if "url" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_url(key, dataset_entry["url"])
|
||||||
|
if "doi" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_doi(key, dataset_entry["doi"])
|
||||||
|
if "uuid" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_uuid(key, dataset_entry["uuid"])
|
||||||
|
if "repo_url" in dataset_entry:
|
||||||
|
self.gguf.add_dataset_repo_url(key, dataset_entry["repo_url"])
|
||||||
|
|
||||||
if metadata.tags is not None:
|
if metadata.tags is not None:
|
||||||
self.gguf.add_tags(metadata.tags)
|
self.gguf.add_tags(metadata.tags)
|
||||||
if metadata.languages is not None:
|
if metadata.languages is not None:
|
||||||
self.gguf.add_languages(metadata.languages)
|
self.gguf.add_languages(metadata.languages)
|
||||||
if metadata.datasets is not None:
|
|
||||||
self.gguf.add_datasets(metadata.datasets)
|
|
||||||
|
|
||||||
def add_meta_arch(self, params: Params) -> None:
|
def add_meta_arch(self, params: Params) -> None:
|
||||||
# Metadata About The Neural Architecture Itself
|
# Metadata About The Neural Architecture Itself
|
||||||
|
@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})

target_compile_features(${TARGET} PRIVATE cxx_std_11)

set(TEST_TARGET test-eval-callback)
add_test(NAME ${TEST_TARGET}
         COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0)
set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
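Since the test registered above carries the `eval-callback` and `curl` labels, it can be selected through standard CTest label filtering. A sketch of how that might be invoked from the build directory (the test downloads its model, so network access is assumed):

```bash
cd build
ctest -L eval-callback --output-on-failure
```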
||||||
|
@ -43,50 +43,6 @@ static std::vector<llama_token> * g_output_tokens;
|
|||||||
|
|
||||||
static bool is_interacting = false;
|
static bool is_interacting = false;
|
||||||
|
|
||||||
static void write_logfile(
|
|
||||||
const llama_context * ctx, const common_params & params, const llama_model * model,
|
|
||||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
|
||||||
const std::vector<llama_token> & output_tokens
|
|
||||||
) {
|
|
||||||
if (params.logdir.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string timestamp = string_get_sortable_timestamp();
|
|
||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERR("%s: warning: failed to create logdir %s, cannot write logfile\n",
|
|
||||||
__func__, params.logdir.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
|
||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
|
||||||
|
|
||||||
if (logfile == NULL) {
|
|
||||||
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(logfile, "binary: infill\n");
|
|
||||||
char model_desc[128];
|
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
|
||||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
|
||||||
fprintf(logfile, "######################\n");
|
|
||||||
fprintf(logfile, "# Generation Results #\n");
|
|
||||||
fprintf(logfile, "######################\n");
|
|
||||||
fprintf(logfile, "\n");
|
|
||||||
|
|
||||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
|
||||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
|
||||||
|
|
||||||
llama_perf_dump_yaml(logfile, ctx);
|
|
||||||
fclose(logfile);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
static void sigint_handler(int signo) {
|
static void sigint_handler(int signo) {
|
||||||
if (signo == SIGINT) {
|
if (signo == SIGINT) {
|
||||||
@ -96,7 +52,6 @@ static void sigint_handler(int signo) {
|
|||||||
console::cleanup();
|
console::cleanup();
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
common_perf_print(*g_ctx, *g_smpl);
|
common_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
|
||||||
|
|
||||||
// make sure all logs are flushed
|
// make sure all logs are flushed
|
||||||
LOG("Interrupted by user\n");
|
LOG("Interrupted by user\n");
|
||||||
@ -118,7 +73,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
auto & sparams = params.sparams;
|
auto & sparams = params.sampling;
|
||||||
|
|
||||||
console::init(params.simple_io, params.use_color);
|
console::init(params.simple_io, params.use_color);
|
||||||
atexit([]() { console::cleanup(); });
|
atexit([]() { console::cleanup(); });
|
||||||
@ -625,7 +580,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
common_perf_print(ctx, smpl);
|
common_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||||||
|
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
|
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||||
if (!smpl) {
|
if (!smpl) {
|
||||||
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
|
||||||
exit(1);
|
exit(1);
|
||||||
|
@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
|
|||||||
|
|
||||||
LOG_INF("\n");
|
LOG_INF("\n");
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
|
struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
|
||||||
return smpl;
|
return smpl;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
|
|||||||
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
|
||||||
|
|
||||||
// target model sampling context
|
// target model sampling context
|
||||||
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||||
|
|
||||||
// verification n-grams
|
// verification n-grams
|
||||||
std::vector<ngram_data> ngrams_cur(G);
|
std::vector<ngram_data> ngrams_cur(G);
|
||||||
|
@ -21,7 +21,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
const int n_draft = params.n_draft;
|
const int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
@ -40,6 +40,7 @@ int main(int argc, char ** argv){
|
|||||||
common_ngram_cache ngram_cache_context;
|
common_ngram_cache ngram_cache_context;
|
||||||
common_ngram_cache ngram_cache_dynamic;
|
common_ngram_cache ngram_cache_dynamic;
|
||||||
common_ngram_cache ngram_cache_static;
|
common_ngram_cache ngram_cache_static;
|
||||||
|
|
||||||
int64_t t_draft_flat_us = 0;
|
int64_t t_draft_flat_us = 0;
|
||||||
int64_t t_draft_us = 0;
|
int64_t t_draft_us = 0;
|
||||||
|
|
||||||
|
@ -22,7 +22,7 @@ int main(int argc, char ** argv){
|
|||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
// max. number of additional tokens to draft if match is found
|
// max. number of additional tokens to draft if match is found
|
||||||
const int n_draft = params.n_draft;
|
const int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
const bool dump_kv_cache = params.dump_kv_cache;
|
const bool dump_kv_cache = params.dump_kv_cache;
|
||||||
|
|
||||||
@ -102,7 +102,7 @@ int main(int argc, char ** argv){
|
|||||||
|
|
||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
struct common_sampler * smpl = common_sampler_init(model, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model, params.sampling);
|
||||||
|
|
||||||
std::vector<llama_token> draft;
|
std::vector<llama_token> draft;
|
||||||
|
|
||||||
|
@ -62,49 +62,6 @@ static bool file_is_empty(const std::string & path) {
|
|||||||
return f.tellg() == 0;
|
return f.tellg() == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void write_logfile(
|
|
||||||
const llama_context * ctx, const common_params & params, const llama_model * model,
|
|
||||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
|
||||||
const std::vector<llama_token> & output_tokens
|
|
||||||
) {
|
|
||||||
if (params.logdir.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string timestamp = string_get_sortable_timestamp();
|
|
||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
|
||||||
if (!success) {
|
|
||||||
LOG_ERR("%s: failed to create logdir %s, cannot write logfile\n", __func__, params.logdir.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
|
||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
|
||||||
|
|
||||||
if (logfile == NULL) {
|
|
||||||
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(logfile, "binary: main\n");
|
|
||||||
char model_desc[128];
|
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
|
||||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
|
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
|
||||||
fprintf(logfile, "######################\n");
|
|
||||||
fprintf(logfile, "# Generation Results #\n");
|
|
||||||
fprintf(logfile, "######################\n");
|
|
||||||
fprintf(logfile, "\n");
|
|
||||||
|
|
||||||
yaml_dump_string_multiline(logfile, "output", output.c_str());
|
|
||||||
yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
|
|
||||||
|
|
||||||
llama_perf_dump_yaml(logfile, ctx);
|
|
||||||
fclose(logfile);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
static void sigint_handler(int signo) {
|
static void sigint_handler(int signo) {
|
||||||
if (signo == SIGINT) {
|
if (signo == SIGINT) {
|
||||||
@ -115,7 +72,6 @@ static void sigint_handler(int signo) {
|
|||||||
console::cleanup();
|
console::cleanup();
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
common_perf_print(*g_ctx, *g_smpl);
|
common_perf_print(*g_ctx, *g_smpl);
|
||||||
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
|
|
||||||
|
|
||||||
// make sure all logs are flushed
|
// make sure all logs are flushed
|
||||||
LOG("Interrupted by user\n");
|
LOG("Interrupted by user\n");
|
||||||
@ -144,7 +100,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
auto & sparams = params.sparams;
|
auto & sparams = params.sampling;
|
||||||
|
|
||||||
// save choice to use color for later
|
// save choice to use color for later
|
||||||
// (note for later: this is a slightly awkward choice)
|
// (note for later: this is a slightly awkward choice)
|
||||||
@ -209,6 +165,10 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
|
||||||
|
|
||||||
|
auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
|
||||||
|
auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new");
|
||||||
|
auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free");
|
||||||
|
|
||||||
struct ggml_threadpool_params tpp_batch =
|
struct ggml_threadpool_params tpp_batch =
|
||||||
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
|
||||||
struct ggml_threadpool_params tpp =
|
struct ggml_threadpool_params tpp =
|
||||||
@ -218,7 +178,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
struct ggml_threadpool * threadpool_batch = NULL;
|
struct ggml_threadpool * threadpool_batch = NULL;
|
||||||
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
|
||||||
threadpool_batch = ggml_threadpool_new(&tpp_batch);
|
threadpool_batch = ggml_threadpool_new_fn(&tpp_batch);
|
||||||
if (!threadpool_batch) {
|
if (!threadpool_batch) {
|
||||||
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
|
||||||
return 1;
|
return 1;
|
||||||
@ -228,7 +188,7 @@ int main(int argc, char ** argv) {
|
|||||||
tpp.paused = true;
|
tpp.paused = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
|
struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp);
|
||||||
if (!threadpool) {
|
if (!threadpool) {
|
||||||
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
|
||||||
return 1;
|
return 1;
|
||||||
@ -926,7 +886,6 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
LOG("\n\n");
|
LOG("\n\n");
|
||||||
common_perf_print(ctx, smpl);
|
common_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
|
||||||
|
|
||||||
common_sampler_free(smpl);
|
common_sampler_free(smpl);
|
||||||
|
|
||||||
@ -935,8 +894,8 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
ggml_threadpool_free(threadpool);
|
ggml_threadpool_free_fn(threadpool);
|
||||||
ggml_threadpool_free(threadpool_batch);
|
ggml_threadpool_free_fn(threadpool_batch);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
|
|||||||
for (size_t i = 0; i < clients.size(); ++i) {
|
for (size_t i = 0; i < clients.size(); ++i) {
|
||||||
auto & client = clients[i];
|
auto & client = clients[i];
|
||||||
client.id = i;
|
client.id = i;
|
||||||
client.smpl = common_sampler_init(model, params.sparams);
|
client.smpl = common_sampler_init(model, params.sampling);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokens_system;
|
std::vector<llama_token> tokens_system;
|
||||||
|
@ -34,55 +34,6 @@ struct results_log_softmax {
|
|||||||
float prob;
|
float prob;
|
||||||
};
|
};
|
||||||
|
|
||||||
static void write_logfile(
|
|
||||||
const llama_context * ctx, const common_params & params, const llama_model * model,
|
|
||||||
const struct results_perplexity & results
|
|
||||||
) {
|
|
||||||
if (params.logdir.empty()) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (params.hellaswag) {
|
|
||||||
LOG_WRN("%s: logging results is not implemented for HellaSwag. No files will be written.\n", __func__);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string timestamp = string_get_sortable_timestamp();
|
|
||||||
|
|
||||||
const bool success = fs_create_directory_with_parents(params.logdir);
|
|
||||||
if (!success) {
|
|
||||||
LOG_WRN("%s: failed to create logdir %s, cannot write logfile\n",
|
|
||||||
__func__, params.logdir.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string logfile_path = params.logdir + timestamp + ".yml";
|
|
||||||
FILE * logfile = fopen(logfile_path.c_str(), "w");
|
|
||||||
|
|
||||||
if (logfile == NULL) {
|
|
||||||
LOG_ERR("%s: failed to open logfile %s\n", __func__, logfile_path.c_str());
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf(logfile, "binary: main\n");
|
|
||||||
char model_desc[128];
|
|
||||||
llama_model_desc(model, model_desc, sizeof(model_desc));
|
|
||||||
yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
|
|
||||||
|
|
||||||
fprintf(logfile, "\n");
|
|
||||||
fprintf(logfile, "######################\n");
|
|
||||||
fprintf(logfile, "# Perplexity Results #\n");
|
|
||||||
fprintf(logfile, "######################\n");
|
|
||||||
fprintf(logfile, "\n");
|
|
||||||
|
|
||||||
yaml_dump_vector_float(logfile, "logits", results.logits);
|
|
||||||
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
|
|
||||||
yaml_dump_vector_float(logfile, "probs", results.probs);
|
|
||||||
|
|
||||||
llama_perf_dump_yaml(logfile, ctx);
|
|
||||||
fclose(logfile);
|
|
||||||
}
|
|
||||||
|
|
||||||
static std::vector<float> softmax(const std::vector<float>& logits) {
|
static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
std::vector<float> probs(logits.size());
|
std::vector<float> probs(logits.size());
|
||||||
float max_logit = logits[0];
|
float max_logit = logits[0];
|
||||||
@ -2072,8 +2023,6 @@ int main(int argc, char ** argv) {
|
|||||||
LOG("\n");
|
LOG("\n");
|
||||||
llama_perf_context_print(ctx);
|
llama_perf_context_print(ctx);
|
||||||
|
|
||||||
write_logfile(ctx, params, model, results);
|
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
|
@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static void test_roundtrip_on_chunk(
|
static void test_roundtrip_on_chunk(
|
||||||
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
|
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
|
||||||
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||||
) {
|
) {
|
||||||
if (layer->type == GGML_TYPE_F16) {
|
if (layer->type == GGML_TYPE_F16) {
|
||||||
@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
|
|||||||
if (use_reference) {
|
if (use_reference) {
|
||||||
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
|
qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
|
||||||
} else {
|
} else {
|
||||||
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
|
qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
|
||||||
}
|
}
|
||||||
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
|
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
|
||||||
|
|
||||||
@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(
|
|||||||
|
|
||||||
// Run quantization function for a single layer and update error stats
|
// Run quantization function for a single layer and update error stats
|
||||||
static void test_roundtrip_on_layer(
|
static void test_roundtrip_on_layer(
|
||||||
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
|
std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
|
||||||
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||||
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||||
) {
|
) {
|
||||||
@ -187,13 +187,13 @@ static void test_roundtrip_on_layer(
|
|||||||
int num_chunks = (nelements + chunk_size - 1)/chunk_size;
|
int num_chunks = (nelements + chunk_size - 1)/chunk_size;
|
||||||
|
|
||||||
if (num_chunks < 2 || max_thread < 2) {
|
if (num_chunks < 2 || max_thread < 2) {
|
||||||
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
|
test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
|
||||||
output_scratch.data(), print_layer_stats ? layer_error : total_error);
|
output_scratch.data(), print_layer_stats ? layer_error : total_error);
|
||||||
} else {
|
} else {
|
||||||
auto & stats = print_layer_stats ? layer_error : total_error;
|
auto & stats = print_layer_stats ? layer_error : total_error;
|
||||||
std::mutex mutex;
|
std::mutex mutex;
|
||||||
uint64_t counter = 0;
|
uint64_t counter = 0;
|
||||||
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
|
auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
|
||||||
&quantized_scratch, &output_scratch, chunk_size] () {
|
&quantized_scratch, &output_scratch, chunk_size] () {
|
||||||
error_stats local_stats {};
|
error_stats local_stats {};
|
||||||
while (true) {
|
while (true) {
|
||||||
@ -205,7 +205,7 @@ static void test_roundtrip_on_layer(
|
|||||||
}
|
}
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
|
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
|
||||||
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
|
test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
|
||||||
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
|
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
|
|||||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
const auto * qfns = ggml_get_type_traits(type);
|
const auto * qfns = ggml_get_type_traits(type);
|
||||||
if (qfns->from_float && qfns->to_float) {
|
const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
|
||||||
|
if (qfns_cpu->from_float && qfns->to_float) {
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
printf("testing %s ...\n", ggml_type_name(type));
|
printf("testing %s ...\n", ggml_type_name(type));
|
||||||
}
|
}
|
||||||
@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
|
|||||||
test_roundtrip_on_layer(
|
test_roundtrip_on_layer(
|
||||||
layer_name,
|
layer_name,
|
||||||
params.per_layer_stats,
|
params.per_layer_stats,
|
||||||
*qfns,
|
*qfns, *qfns_cpu,
|
||||||
params.reference,
|
params.reference,
|
||||||
kv_tensor.second,
|
kv_tensor.second,
|
||||||
input_scratch,
|
input_scratch,
|
||||||
|
@ -282,8 +282,8 @@ int main(int argc, char ** argv) {
|
|||||||
return a.second > b.second;
|
return a.second > b.second;
|
||||||
});
|
});
|
||||||
|
|
||||||
LOG("Top %d similar chunks:\n", params.sparams.top_k);
|
LOG("Top %d similar chunks:\n", params.sampling.top_k);
|
||||||
for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
|
for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
|
||||||
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
|
||||||
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
|
||||||
LOG("similarity: %f\n", similarities[i].second);
|
LOG("similarity: %f\n", similarities[i].second);
|
||||||
|
5
examples/run/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@

set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
7
examples/run/README.md
Normal file
@ -0,0 +1,7 @@

# llama.cpp/example/run

The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
...
```
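Judging from the option parsing in the new `run.cpp` (shown below), the example also accepts explicit flags and piped input; a hypothetical invocation under those assumptions:

```bash
# flags as parsed by run.cpp: -m (model), -p (prompt), -c (context size), -ngl (GPU layers)
./llama-run -m Meta-Llama-3.1-8B-Instruct.gguf -p "Hello" -ngl 99 -c 2048

# when stdin is not a terminal, its contents are appended to the prompt
echo "Summarize this sentence." | ./llama-run -m Meta-Llama-3.1-8B-Instruct.gguf
```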
409
examples/run/run.cpp
Normal file
409
examples/run/run.cpp
Normal file
@ -0,0 +1,409 @@
|
|||||||
|
#if defined(_WIN32)
|
||||||
|
#include <windows.h>
|
||||||
|
#else
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <climits>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
#include <unordered_map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "llama-cpp.h"
|
||||||
|
|
||||||
|
typedef std::unique_ptr<char[]> char_array_ptr;
|
||||||
|
|
||||||
|
struct Argument {
|
||||||
|
std::string flag;
|
||||||
|
std::string help_text;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct Options {
|
||||||
|
std::string model_path, prompt_non_interactive;
|
||||||
|
int ngl = 99;
|
||||||
|
int n_ctx = 2048;
|
||||||
|
};
|
||||||
|
|
||||||
|
class ArgumentParser {
|
||||||
|
public:
|
||||||
|
ArgumentParser(const char * program_name) : program_name(program_name) {}
|
||||||
|
|
||||||
|
void add_argument(const std::string & flag, std::string & var, const std::string & help_text = "") {
|
||||||
|
string_args[flag] = &var;
|
||||||
|
arguments.push_back({flag, help_text});
|
||||||
|
}
|
||||||
|
|
||||||
|
void add_argument(const std::string & flag, int & var, const std::string & help_text = "") {
|
||||||
|
int_args[flag] = &var;
|
||||||
|
arguments.push_back({flag, help_text});
|
||||||
|
}
|
||||||
|
|
||||||
|
int parse(int argc, const char ** argv) {
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
std::string arg = argv[i];
|
||||||
|
if (string_args.count(arg)) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
*string_args[arg] = argv[++i];
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
||||||
|
print_usage();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (int_args.count(arg)) {
|
||||||
|
if (i + 1 < argc) {
|
||||||
|
if (parse_int_arg(argv[++i], *int_args[arg]) != 0) {
|
||||||
|
fprintf(stderr, "error: invalid value for %s: %s\n", arg.c_str(), argv[i]);
|
||||||
|
print_usage();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: missing value for %s\n", arg.c_str());
|
||||||
|
print_usage();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: unrecognized argument %s\n", arg.c_str());
|
||||||
|
print_usage();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (string_args["-m"]->empty()) {
|
||||||
|
fprintf(stderr, "error: -m is required\n");
|
||||||
|
print_usage();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
const char * program_name;
|
||||||
|
std::unordered_map<std::string, std::string *> string_args;
|
||||||
|
std::unordered_map<std::string, int *> int_args;
|
||||||
|
std::vector<Argument> arguments;
|
||||||
|
|
||||||
|
int parse_int_arg(const char * arg, int & value) {
|
||||||
|
char * end;
|
||||||
|
const long val = std::strtol(arg, &end, 10);
|
||||||
|
if (*end == '\0' && val >= INT_MIN && val <= INT_MAX) {
|
||||||
|
value = static_cast<int>(val);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_usage() const {
|
||||||
|
printf("\nUsage:\n");
|
||||||
|
printf(" %s [OPTIONS]\n\n", program_name);
|
||||||
|
printf("Options:\n");
|
||||||
|
for (const auto & arg : arguments) {
|
||||||
|
printf(" %-10s %s\n", arg.flag.c_str(), arg.help_text.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
class LlamaData {
|
||||||
|
public:
|
||||||
|
llama_model_ptr model;
|
||||||
|
llama_sampler_ptr sampler;
|
||||||
|
llama_context_ptr context;
|
||||||
|
std::vector<llama_chat_message> messages;
|
||||||
|
|
||||||
|
int init(const Options & opt) {
|
||||||
|
model = initialize_model(opt.model_path, opt.ngl);
|
||||||
|
if (!model) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
context = initialize_context(model, opt.n_ctx);
|
||||||
|
if (!context) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
sampler = initialize_sampler();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private:
|
||||||
|
// Initializes the model and returns a unique pointer to it
|
||||||
|
llama_model_ptr initialize_model(const std::string & model_path, const int ngl) {
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_params.n_gpu_layers = ngl;
|
||||||
|
|
||||||
|
llama_model_ptr model(llama_load_model_from_file(model_path.c_str(), model_params));
|
||||||
|
if (!model) {
|
||||||
|
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
|
return model;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initializes the context with the specified parameters
|
||||||
|
llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
ctx_params.n_ctx = n_ctx;
|
||||||
|
ctx_params.n_batch = n_ctx;
|
||||||
|
|
||||||
|
llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
|
||||||
|
if (!context) {
|
||||||
|
fprintf(stderr, "%s: error: failed to create the llama_context\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
|
return context;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initializes and configures the sampler
|
||||||
|
llama_sampler_ptr initialize_sampler() {
|
||||||
|
llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
|
||||||
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
|
||||||
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
|
||||||
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
|
||||||
|
|
||||||
|
return sampler;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Add a message to `messages` and store its content in `owned_content`
|
||||||
|
static void add_message(const char * role, const std::string & text, LlamaData & llama_data,
|
||||||
|
std::vector<char_array_ptr> & owned_content) {
|
||||||
|
char_array_ptr content(new char[text.size() + 1]);
|
||||||
|
std::strcpy(content.get(), text.c_str());
|
||||||
|
llama_data.messages.push_back({role, content.get()});
|
||||||
|
owned_content.push_back(std::move(content));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function to apply the chat template and resize `formatted` if needed
|
||||||
|
static int apply_chat_template(const LlamaData & llama_data, std::vector<char> & formatted, const bool append) {
|
||||||
|
int result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
|
||||||
|
llama_data.messages.size(), append, formatted.data(), formatted.size());
|
||||||
|
if (result > static_cast<int>(formatted.size())) {
|
||||||
|
formatted.resize(result);
|
||||||
|
result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
|
||||||
|
llama_data.messages.size(), append, formatted.data(), formatted.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function to tokenize the prompt
|
||||||
|
static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
|
||||||
|
std::vector<llama_token> & prompt_tokens) {
|
||||||
|
const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
|
||||||
|
prompt_tokens.resize(n_prompt_tokens);
|
||||||
|
if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
|
||||||
|
true) < 0) {
|
||||||
|
GGML_ABORT("failed to tokenize the prompt\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return n_prompt_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if we have enough space in the context to evaluate this batch
|
||||||
|
static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
|
||||||
|
const int n_ctx = llama_n_ctx(ctx.get());
|
||||||
|
const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
|
||||||
|
if (n_ctx_used + batch.n_tokens > n_ctx) {
|
||||||
|
printf("\033[0m\n");
|
||||||
|
fprintf(stderr, "context size exceeded\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert the token to a string
|
||||||
|
static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
|
||||||
|
char buf[256];
|
||||||
|
int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
|
||||||
|
if (n < 0) {
|
||||||
|
GGML_ABORT("failed to convert token to piece\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
piece = std::string(buf, n);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
|
||||||
|
printf("%s", piece.c_str());
|
||||||
|
fflush(stdout);
|
||||||
|
response += piece;
|
||||||
|
}
|
||||||
|
|
||||||
|
// helper function to evaluate a prompt and generate a response
|
||||||
|
static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
|
||||||
|
std::vector<llama_token> prompt_tokens;
|
||||||
|
const int n_prompt_tokens = tokenize_prompt(llama_data.model, prompt, prompt_tokens);
|
||||||
|
if (n_prompt_tokens < 0) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare a batch for the prompt
|
||||||
|
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
|
||||||
|
llama_token new_token_id;
|
||||||
|
while (true) {
|
||||||
|
check_context_size(llama_data.context, batch);
|
||||||
|
if (llama_decode(llama_data.context.get(), batch)) {
|
||||||
|
GGML_ABORT("failed to decode\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// sample the next token, check is it an end of generation?
|
||||||
|
new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
|
||||||
|
if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string piece;
|
||||||
|
if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
print_word_and_concatenate_to_response(piece, response);
|
||||||
|
|
||||||
|
// prepare the next batch with the sampled token
|
||||||
|
batch = llama_batch_get_one(&new_token_id, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int parse_arguments(const int argc, const char ** argv, Options & opt) {
|
||||||
|
ArgumentParser parser(argv[0]);
|
||||||
|
parser.add_argument("-m", opt.model_path, "model");
|
||||||
|
parser.add_argument("-p", opt.prompt_non_interactive, "prompt");
|
||||||
|
parser.add_argument("-c", opt.n_ctx, "context_size");
|
||||||
|
parser.add_argument("-ngl", opt.ngl, "n_gpu_layers");
|
||||||
|
if (parser.parse(argc, argv)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int read_user_input(std::string & user) {
|
||||||
|
std::getline(std::cin, user);
|
||||||
|
return user.empty(); // Indicate an error or empty input
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function to generate a response based on the prompt
|
||||||
|
static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
|
||||||
|
// Set response color
|
||||||
|
printf("\033[33m");
|
||||||
|
if (generate(llama_data, prompt, response)) {
|
||||||
|
fprintf(stderr, "failed to generate response\n");
|
||||||
|
        return 1;
    }

    // End response with color reset and newline
    printf("\n\033[0m");

    return 0;
}

// Helper function to apply the chat template and handle errors
static int apply_chat_template_with_error_handling(const LlamaData & llama_data, std::vector<char> & formatted,
                                                   const bool is_user_input, int & output_length) {
    const int new_len = apply_chat_template(llama_data, formatted, is_user_input);
    if (new_len < 0) {
        fprintf(stderr, "failed to apply the chat template\n");
        return -1;
    }

    output_length = new_len;
    return 0;
}

// Helper function to handle user input
static bool handle_user_input(std::string & user_input, const std::string & prompt_non_interactive) {
    if (!prompt_non_interactive.empty()) {
        user_input = prompt_non_interactive;
        return true;  // No need for interactive input
    }

    printf("\033[32m> \033[0m");
    return !read_user_input(user_input);  // Returns false if input ends the loop
}

// Main chat loop: read user input, apply the chat template and generate responses
static int chat_loop(LlamaData & llama_data, std::string & prompt_non_interactive) {
    std::vector<char_array_ptr> owned_content;
    std::vector<char> fmtted(llama_n_ctx(llama_data.context.get()));
    int prev_len = 0;

    while (true) {
        // Get user input
        std::string user_input;
        if (!handle_user_input(user_input, prompt_non_interactive)) {
            break;
        }

        add_message("user", prompt_non_interactive.empty() ? user_input : prompt_non_interactive, llama_data,
                    owned_content);

        int new_len;
        if (apply_chat_template_with_error_handling(llama_data, fmtted, true, new_len) < 0) {
            return 1;
        }

        std::string prompt(fmtted.begin() + prev_len, fmtted.begin() + new_len);
        std::string response;
        if (generate_response(llama_data, prompt, response)) {
            return 1;
        }
    }

    return 0;
}

static void log_callback(const enum ggml_log_level level, const char * text, void *) {
    if (level == GGML_LOG_LEVEL_ERROR) {
        fprintf(stderr, "%s", text);
    }
}

static bool is_stdin_a_terminal() {
#if defined(_WIN32)
    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
    DWORD mode;
    return GetConsoleMode(hStdin, &mode);
#else
    return isatty(STDIN_FILENO);
#endif
}

static std::string read_pipe_data() {
    std::ostringstream result;
    result << std::cin.rdbuf();  // Read all data from std::cin
    return result.str();
}

int main(int argc, const char ** argv) {
    Options opt;
    if (parse_arguments(argc, argv, opt)) {
        return 1;
    }

    if (!is_stdin_a_terminal()) {
        if (!opt.prompt_non_interactive.empty()) {
            opt.prompt_non_interactive += "\n\n";
        }

        opt.prompt_non_interactive += read_pipe_data();
    }

    llama_log_set(log_callback, nullptr);
    LlamaData llama_data;
    if (llama_data.init(opt)) {
        return 1;
    }

    if (chat_loop(llama_data, opt.prompt_non_interactive)) {
        return 1;
    }

    return 0;
}
@@ -9,7 +9,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.prompt = "The quick brown fox";
-    params.sparams.seed = 1234;
+    params.sampling.seed = 1234;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
@@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
 
     // tokenize prompt
     auto tokens = common_tokenize(ctx, params.prompt, true);
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsecond run: %s", params.prompt.c_str());
 
@@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
 
@ -39,7 +39,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
| `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
|
||||||
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
|
||||||
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
| `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
|
||||||
| `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
| `-c, --ctx-size N` | size of the prompt context (default: 4096, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
|
||||||
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
| `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
|
||||||
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
| `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
|
||||||
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
| `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
|
||||||
@ -64,7 +64,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
|
||||||
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
| `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
|
||||||
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
| `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
|
||||||
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
|
||||||
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
|
||||||
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
|
||||||
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
|
||||||
@ -85,7 +85,6 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
| `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |
|
||||||
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
| `-hff, --hf-file FILE` | Hugging Face model file (default: unused)<br/>(env: LLAMA_ARG_HF_FILE) |
|
||||||
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
| `-hft, --hf-token TOKEN` | Hugging Face access token (default: value from HF_TOKEN environment variable)<br/>(env: HF_TOKEN) |
|
||||||
| `-ld, --logdir LOGDIR` | path under which to save YAML logs (no logging if unset) |
|
|
||||||
| `--log-disable` | Log disable |
|
| `--log-disable` | Log disable |
|
||||||
| `--log-file FNAME` | Log to file |
|
| `--log-file FNAME` | Log to file |
|
||||||
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
| `--log-colors` | Enable colored logging<br/>(env: LLAMA_LOG_COLORS) |
|
||||||
@ -99,25 +98,27 @@ The project is under active development, and we are [looking for feedback and co
|
|||||||
|
|
||||||
| Argument | Explanation |
|
| Argument | Explanation |
|
||||||
| -------- | ----------- |
|
| -------- | ----------- |
|
||||||
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;typ_p;top_p;min_p;temperature) |
|
| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: dry;top_k;typ_p;top_p;min_p;xtc;temperature) |
|
||||||
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
| `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
|
||||||
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
|
| `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) |
|
||||||
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
| `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
|
||||||
| `--penalize-nl` | penalize newline tokens (default: false) |
|
| `--penalize-nl` | penalize newline tokens (default: false) |
|
||||||
| `--temp N` | temperature (default: 0.8) |
|
| `--temp N` | temperature (default: 0.8) |
|
||||||
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
| `--top-k N` | top-k sampling (default: 40, 0 = disabled) |
|
||||||
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
| `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) |
|
||||||
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
| `--min-p N` | min-p sampling (default: 0.1, 0.0 = disabled) |
|
||||||
|
| `--xtc-probability N` | xtc probability (default: 0.0, 0.0 = disabled) |
|
||||||
|
| `--xtc-threshold N` | xtc threshold (default: 0.1, 1.0 = disabled) |
|
||||||
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
| `--typical N` | locally typical sampling, parameter p (default: 1.0, 1.0 = disabled) |
|
||||||
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
| `--repeat-last-n N` | last n tokens to consider for penalize (default: 64, 0 = disabled, -1 = ctx_size) |
|
||||||
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
|
| `--repeat-penalty N` | penalize repeat sequence of tokens (default: 1.0, 1.0 = disabled) |
|
||||||
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
|
| `--presence-penalty N` | repeat alpha presence penalty (default: 0.0, 0.0 = disabled) |
|
||||||
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
|
| `--frequency-penalty N` | repeat alpha frequency penalty (default: 0.0, 0.0 = disabled) |
|
||||||
| `--dry-multiplier N` | DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
| `--dry-multiplier N` | set DRY sampling multiplier (default: 0.0, 0.0 = disabled) |
|
||||||
| `--dry-base N` | DRY sampling base value (default: 1.75) |
|
| `--dry-base N` | set DRY sampling base value (default: 1.75) |
|
||||||
| `--dry-allowed-length N` | allowed length for DRY sampling (default: 2) |
|
| `--dry-allowed-length N` | set allowed length for DRY sampling (default: 2) |
|
||||||
| `--dry-penalty-last-n N` | DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
| `--dry-penalty-last-n N` | set DRY penalty for the last n tokens (default: -1, 0 = disable, -1 = context size) |
|
||||||
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers (`['\n', ':', '"', '*']`) in the process; use `"none"` to not use any sequence breakers
|
| `--dry-sequence-breaker STRING` | add sequence breaker for DRY sampling, clearing out default breakers ('\n', ':', '"', '*') in the process; use "none" to not use any sequence breakers<br/> |
|
||||||
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
|
| `--dynatemp-range N` | dynamic temperature range (default: 0.0, 0.0 = disabled) |
|
||||||
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
|
| `--dynatemp-exp N` | dynamic temperature exponent (default: 1.0) |
|
||||||
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
|
| `--mirostat N` | use Mirostat sampling.<br/>Top K, Nucleus and Locally Typical samplers are ignored if used.<br/>(default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0) |
|
||||||
@@ -381,6 +382,10 @@ node index.js
 
 `dry_sequence_breakers`: Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
 
+`xtc_probability`: Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
+
+`xtc_threshold`: Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
+
 `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
 
 `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0`
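For reference, a request body that enables both XTC and DRY could look like the following (an illustrative sketch added by the editor; the field names are the ones documented above, the values are arbitrary):

```json
{
  "prompt": "Write a short story about a robot.",
  "n_predict": 128,
  "xtc_probability": 0.5,
  "xtc_threshold": 0.1,
  "dry_multiplier": 0.8,
  "dry_base": 1.75,
  "dry_allowed_length": 2,
  "dry_sequence_breakers": ["\n", ":", "\"", "*"]
}
```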
@@ -407,9 +412,9 @@ node index.js
 
 `id_slot`: Assign the completion task to a specific slot. If `-1`, the task will be assigned to an idle slot. Default: `-1`
 
-`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`
+`cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true`
 
-`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.
+`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
 
 **Response format**
 
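To spell out the new defaults explicitly in a request (an editor-added sketch using only the fields described above):

```json
{
  "prompt": "Hello",
  "cache_prompt": true,
  "samplers": ["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]
}
```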
@ -12,7 +12,7 @@
|
|||||||
.markdown {
|
.markdown {
|
||||||
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
|
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
|
||||||
pre {
|
pre {
|
||||||
@apply whitespace-pre-wrap my-4 rounded-lg p-2;
|
@apply whitespace-pre-wrap rounded-lg p-2;
|
||||||
border: 1px solid currentColor;
|
border: 1px solid currentColor;
|
||||||
}
|
}
|
||||||
/* TODO: fix markdown table */
|
/* TODO: fix markdown table */
|
||||||
@ -25,8 +25,11 @@
|
|||||||
.bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
|
.bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
|
||||||
.bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
|
.bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
|
||||||
.text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
|
.text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
|
||||||
|
.show-on-hover {
|
||||||
|
@apply opacity-0 group-hover:opacity-100;
|
||||||
|
}
|
||||||
.btn-mini {
|
.btn-mini {
|
||||||
@apply cursor-pointer opacity-0 group-hover:opacity-100 hover:shadow-md;
|
@apply cursor-pointer hover:shadow-md;
|
||||||
}
|
}
|
||||||
.chat-screen { max-width: 900px; }
|
.chat-screen { max-width: 900px; }
|
||||||
/* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
|
/* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
|
||||||
@ -78,7 +81,13 @@
|
|||||||
<path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
|
<path d="M14.5 3a1 1 0 0 1-1 1H13v9a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2V4h-.5a1 1 0 0 1-1-1V2a1 1 0 0 1 1-1H6a1 1 0 0 1 1-1h2a1 1 0 0 1 1 1h3.5a1 1 0 0 1 1 1zM4.118 4 4 4.059V13a1 1 0 0 0 1 1h6a1 1 0 0 0 1-1V4.059L11.882 4zM2.5 3h11V2h-11z"/>
|
||||||
</svg>
|
</svg>
|
||||||
</button>
|
</button>
|
||||||
|
<button v-if="messages.length > 0" class="btn mr-1" @click="downloadConv(viewingConvId)" :disabled="isGenerating">
|
||||||
|
<!-- download conversation button -->
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-download" viewBox="0 0 16 16">
|
||||||
|
<path d="M.5 9.9a.5.5 0 0 1 .5.5v2.5a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-2.5a.5.5 0 0 1 1 0v2.5a2 2 0 0 1-2 2H2a2 2 0 0 1-2-2v-2.5a.5.5 0 0 1 .5-.5"/>
|
||||||
|
<path d="M7.646 11.854a.5.5 0 0 0 .708 0l3-3a.5.5 0 0 0-.708-.708L8.5 10.293V1.5a.5.5 0 0 0-1 0v8.793L5.354 8.146a.5.5 0 1 0-.708.708z"/>
|
||||||
|
</svg>
|
||||||
|
</button>
|
||||||
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
|
<button class="btn" @click="showConfigDialog = true" :disabled="isGenerating">
|
||||||
<!-- edit config button -->
|
<!-- edit config button -->
|
||||||
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
|
<svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor" class="bi bi-gear" viewBox="0 0 16 16">
|
||||||
@ -152,14 +161,14 @@
|
|||||||
<!-- actions for each message -->
|
<!-- actions for each message -->
|
||||||
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
|
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
|
||||||
<!-- user message -->
|
<!-- user message -->
|
||||||
-<button v-if="msg.role === 'user'" class="badge btn-mini" @click="editingMsg = msg" :disabled="isGenerating">
+<button v-if="msg.role === 'user'" class="badge btn-mini show-on-hover" @click="editingMsg = msg" :disabled="isGenerating">
✍️ Edit
|
✍️ Edit
|
||||||
</button>
|
</button>
|
||||||
<!-- assistant message -->
|
<!-- assistant message -->
|
||||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
||||||
🔄 Regenerate
|
🔄 Regenerate
|
||||||
</button>
|
</button>
|
||||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
|
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
|
||||||
📋 Copy
|
📋 Copy
|
||||||
</button>
|
</button>
|
||||||
</div>
|
</div>
|
||||||
@ -196,27 +205,47 @@
|
|||||||
<h3 class="text-lg font-bold mb-6">Settings</h3>
|
<h3 class="text-lg font-bold mb-6">Settings</h3>
|
||||||
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
|
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
|
||||||
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
|
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
|
||||||
|
<settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
|
||||||
<label class="form-control mb-2">
|
<label class="form-control mb-2">
|
||||||
<div class="label">System Message</div>
|
<div class="label">System Message</div>
|
||||||
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
|
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
|
||||||
</label>
|
</label>
|
||||||
<template v-for="key in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
|
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
|
||||||
<label class="input input-bordered flex items-center gap-2 mb-2">
|
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||||
<b>{{ key }}</b>
|
|
||||||
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[key] || 'none')" v-model="config[key]" />
|
|
||||||
</label>
|
|
||||||
</template>
|
</template>
|
||||||
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
|
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
|
||||||
<div class="collapse collapse-arrow bg-base-200 mb-2">
|
<!-- Section: Other sampler settings -->
|
||||||
<input type="checkbox" />
|
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||||
<div class="collapse-title font-bold">Advanced config</div>
|
<summary class="collapse-title font-bold">Other sampler settings</summary>
|
||||||
|
<div class="collapse-content">
|
||||||
|
<!-- Samplers queue -->
|
||||||
|
<settings-modal-short-input label="Samplers queue" :config-key="'samplers'" :config-default="configDefault" :config-info="configInfo" v-model="config.samplers"></settings-modal-short-input>
|
||||||
|
<!-- Samplers -->
|
||||||
|
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
|
||||||
|
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
<!-- Section: Penalties settings -->
|
||||||
|
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||||
|
<summary class="collapse-title font-bold">Penalties settings</summary>
|
||||||
|
<div class="collapse-content">
|
||||||
|
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
|
||||||
|
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
<!-- Section: Advanced config -->
|
||||||
|
<details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
|
||||||
|
<summary class="collapse-title font-bold">Advanced config</summary>
|
||||||
<div class="collapse-content">
|
<div class="collapse-content">
|
||||||
<label class="form-control mb-2">
|
<label class="form-control mb-2">
|
||||||
|
<!-- Custom parameters input -->
|
||||||
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
|
<div class="label inline">Custom JSON config (For more info, refer to <a class="underline" href="https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md" target="_blank" rel="noopener noreferrer">server documentation</a>)</div>
|
||||||
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { "mirostat": 1, "min_p": 0.1 }" v-model="config.custom"></textarea>
|
<textarea class="textarea textarea-bordered h-24" placeholder="Example: { "mirostat": 1, "min_p": 0.1 }" v-model="config.custom"></textarea>
|
||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</details>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- action buttons -->
|
<!-- action buttons -->
|
||||||
@ -229,14 +258,33 @@
|
|||||||
</dialog>
|
</dialog>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Template to be used by settings modal -->
|
||||||
|
<template id="settings-modal-short-input">
|
||||||
|
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
|
||||||
|
<!-- Show help message on hovering on the input label -->
|
||||||
|
<div class="dropdown dropdown-hover">
|
||||||
|
<div tabindex="0" role="button" class="font-bold">{{ label || configKey }}</div>
|
||||||
|
<div class="dropdown-content menu bg-base-100 rounded-box z-10 w-64 p-2 shadow mt-4">
|
||||||
|
{{ configInfo[configKey] || '(no help message available)' }}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<!-- Here we forward v-model from parent to child component, see: https://stackoverflow.com/questions/47311936/v-model-and-child-components -->
|
||||||
|
<input type="text" class="grow" :placeholder="'Default: ' + (configDefault[configKey] || 'none')" :value="modelValue" @input="$emit('update:modelValue', $event.target.value)" />
|
||||||
|
</label>
|
||||||
|
</template>
|
||||||
|
|
||||||
<script src="./deps_markdown-it.js"></script>
|
<script src="./deps_markdown-it.js"></script>
|
||||||
<script type="module">
|
<script type="module">
|
||||||
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
|
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
|
||||||
import { llama } from './completion.js';
|
import { llama } from './completion.js';
|
||||||
|
|
||||||
|
// utility functions
|
||||||
const isString = (x) => !!x.toLowerCase;
|
const isString = (x) => !!x.toLowerCase;
|
||||||
const isNumeric = (n) => !isString(n) && !isNaN(n);
|
const isNumeric = (n) => !isString(n) && !isNaN(n);
|
||||||
|
const escapeAttr = (str) => str.replace(/>/g, '&gt;').replace(/"/g, '&quot;'); // escape HTML special characters for use inside attribute values
|
||||||
|
const copyStr = (str) => navigator.clipboard.writeText(str);
|
||||||
|
|
||||||
|
// constants
|
||||||
const BASE_URL = localStorage.getItem('base') // for debugging
|
const BASE_URL = localStorage.getItem('base') // for debugging
|
||||||
|| (new URL('.', document.baseURI).href).toString(); // for production
|
|| (new URL('.', document.baseURI).href).toString(); // for production
|
||||||
const CONFIG_DEFAULT = {
|
const CONFIG_DEFAULT = {
|
||||||
@ -244,13 +292,51 @@
|
|||||||
apiKey: '',
|
apiKey: '',
|
||||||
systemMessage: 'You are a helpful assistant.',
|
systemMessage: 'You are a helpful assistant.',
|
||||||
// make sure these default values are in sync with `common.h`
|
// make sure these default values are in sync with `common.h`
|
||||||
|
samplers: 'dkypmxt',
|
||||||
temperature: 0.8,
|
temperature: 0.8,
|
||||||
|
dynatemp_range: 0.0,
|
||||||
|
dynatemp_exponent: 1.0,
|
||||||
top_k: 40,
|
top_k: 40,
|
||||||
top_p: 0.95,
|
top_p: 0.95,
|
||||||
min_p: 0.05,
|
min_p: 0.05,
|
||||||
|
xtc_probability: 0.0,
|
||||||
|
xtc_threshold: 0.1,
|
||||||
|
typical_p: 1.0,
|
||||||
|
repeat_last_n: 64,
|
||||||
|
repeat_penalty: 1.0,
|
||||||
|
presence_penalty: 0.0,
|
||||||
|
frequency_penalty: 0.0,
|
||||||
|
dry_multiplier: 0.0,
|
||||||
|
dry_base: 1.75,
|
||||||
|
dry_allowed_length: 2,
|
||||||
|
dry_penalty_last_n: -1,
|
||||||
max_tokens: -1,
|
max_tokens: -1,
|
||||||
custom: '', // custom json-stringified object
|
custom: '', // custom json-stringified object
|
||||||
};
|
};
|
||||||
const CONFIG_INFO = {
  apiKey: 'Set the API Key if you are using --api-key option for the server.',
  systemMessage: 'The starting message that defines how the model should behave.',
  samplers: 'The order in which samplers are applied, in a simplified form. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
  temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
  dynatemp_range: 'Addon for the temperature sampler. The added value to the range of dynamic temperature, which adjusts probabilities by entropy of tokens.',
  dynatemp_exponent: 'Addon for the temperature sampler. Smooths out the probability redistribution based on the most probable token.',
  top_k: 'Keeps only k top tokens.',
  top_p: 'Limits tokens to those that together have a cumulative probability of at least p.',
  min_p: 'Limits tokens based on the minimum probability for a token to be considered, relative to the probability of the most likely token.',
  xtc_probability: 'XTC sampler cuts out top tokens; this parameter controls the chance of cutting tokens at all. 0 disables XTC.',
  xtc_threshold: 'XTC sampler cuts out top tokens; this parameter controls the token probability that is required to cut that token.',
  typical_p: 'Sorts and limits tokens based on the difference between log-probability and entropy.',
  repeat_last_n: 'Last n tokens to consider for penalizing repetition.',
  repeat_penalty: 'Controls the repetition of token sequences in the generated text.',
  presence_penalty: 'Limits tokens based on whether they appear in the output or not.',
  frequency_penalty: 'Limits tokens based on how often they appear in the output.',
  dry_multiplier: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling multiplier.',
  dry_base: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY sampling base value.',
  dry_allowed_length: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the allowed length for DRY sampling.',
  dry_penalty_last_n: 'DRY sampling reduces repetition in generated text even across long contexts. This parameter sets the DRY penalty for the last n tokens.',
  max_tokens: 'The maximum number of tokens per output.',
  custom: '', // custom json-stringified object
};
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
|
// config keys having numeric value (i.e. temperature, top_k, top_p, etc)
|
||||||
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
|
const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]);
|
||||||
// list of themes supported by daisyui
|
// list of themes supported by daisyui
|
||||||
@ -259,16 +345,37 @@
|
|||||||
// markdown support
|
// markdown support
|
||||||
const VueMarkdown = defineComponent(
|
const VueMarkdown = defineComponent(
|
||||||
(props) => {
|
(props) => {
|
||||||
const md = shallowRef(new markdownit(props.options ?? { breaks: true }));
|
const md = shallowRef(new markdownit({ breaks: true }));
|
||||||
for (const plugin of props.plugins ?? []) {
|
const origFenchRenderer = md.value.renderer.rules.fence;
|
||||||
md.value.use(plugin);
|
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
|
||||||
}
|
const content = tokens[idx].content;
|
||||||
|
const origRendered = origFenchRenderer(tokens, idx, ...args);
|
||||||
|
return `<div class="relative my-4">
|
||||||
|
<div class="text-right sticky top-4 mb-2 mr-2 h-0">
|
||||||
|
<button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
|
||||||
|
</div>
|
||||||
|
${origRendered}
|
||||||
|
</div>`;
|
||||||
|
};
|
||||||
|
window.copyStr = copyStr;
|
||||||
const content = computed(() => md.value.render(props.source));
|
const content = computed(() => md.value.render(props.source));
|
||||||
return () => h("div", { innerHTML: content.value });
|
return () => h("div", { innerHTML: content.value });
|
||||||
},
|
},
|
||||||
{ props: ["source", "options", "plugins"] }
|
{ props: ["source"] }
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// input field to be used by settings modal
|
||||||
|
const SettingsModalShortInput = defineComponent({
|
||||||
|
template: document.getElementById('settings-modal-short-input').innerHTML,
|
||||||
|
props: {
|
||||||
|
label: { type: String, required: false },
|
||||||
|
configKey: String,
|
||||||
|
configDefault: Object,
|
||||||
|
configInfo: Object,
|
||||||
|
modelValue: [Object, String, Number],
|
||||||
|
},
|
||||||
|
});
|
||||||
|
|
||||||
// conversations are stored in localStorage
// format: { [convId]: { id: string, lastModified: number, messages: [...] } }
// convId is a string prefixed with 'conv-'
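// Editor's note: a minimal sketch of the storage format described above
// (illustrative only, not part of the upstream code):
const exampleConv = {
  id: 'conv-1700000000000',          // convId, prefixed with 'conv-'
  lastModified: Date.now(),
  messages: [
    { role: 'user', content: 'Hello' },
    { role: 'assistant', content: 'Hi there!' },
  ],
};
localStorage.setItem(exampleConv.id, JSON.stringify(exampleConv));
const restored = JSON.parse(localStorage.getItem(exampleConv.id)); // round-trips the conversation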
@ -318,7 +425,11 @@
|
|||||||
if (!conv) return;
|
if (!conv) return;
|
||||||
const msg = conv.messages.pop();
|
const msg = conv.messages.pop();
|
||||||
conv.lastModified = Date.now();
|
conv.lastModified = Date.now();
|
||||||
localStorage.setItem(convId, JSON.stringify(conv));
|
if (conv.messages.length === 0) {
|
||||||
|
StorageUtils.remove(convId);
|
||||||
|
} else {
|
||||||
|
localStorage.setItem(convId, JSON.stringify(conv));
|
||||||
|
}
|
||||||
return msg;
|
return msg;
|
||||||
},
|
},
|
||||||
|
|
||||||
@ -359,6 +470,7 @@
|
|||||||
const mainApp = createApp({
|
const mainApp = createApp({
|
||||||
components: {
|
components: {
|
||||||
VueMarkdown,
|
VueMarkdown,
|
||||||
|
SettingsModalShortInput,
|
||||||
},
|
},
|
||||||
data() {
|
data() {
|
||||||
return {
|
return {
|
||||||
@ -376,6 +488,7 @@
|
|||||||
// const
|
// const
|
||||||
themes: THEMES,
|
themes: THEMES,
|
||||||
configDefault: {...CONFIG_DEFAULT},
|
configDefault: {...CONFIG_DEFAULT},
|
||||||
|
configInfo: {...CONFIG_INFO},
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
computed: {},
|
computed: {},
|
||||||
@ -419,6 +532,23 @@
|
|||||||
this.fetchMessages();
|
this.fetchMessages();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
downloadConv(convId) {
|
||||||
|
const conversation = StorageUtils.getOneConversation(convId);
|
||||||
|
if (!conversation) {
|
||||||
|
alert('Conversation not found.');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const conversationJson = JSON.stringify(conversation, null, 2);
|
||||||
|
const blob = new Blob([conversationJson], { type: 'application/json' });
|
||||||
|
const url = URL.createObjectURL(blob);
|
||||||
|
const a = document.createElement('a');
|
||||||
|
a.href = url;
|
||||||
|
a.download = `conversation_${convId}.json`;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
document.body.removeChild(a);
|
||||||
|
URL.revokeObjectURL(url);
|
||||||
|
},
|
||||||
async sendMessage() {
|
async sendMessage() {
|
||||||
if (!this.inputMsg) return;
|
if (!this.inputMsg) return;
|
||||||
const currConvId = this.viewingConvId;
|
const currConvId = this.viewingConvId;
|
||||||
@ -451,9 +581,24 @@
|
|||||||
],
|
],
|
||||||
stream: true,
|
stream: true,
|
||||||
cache_prompt: true,
|
cache_prompt: true,
|
||||||
|
samplers: this.config.samplers,
|
||||||
temperature: this.config.temperature,
|
temperature: this.config.temperature,
|
||||||
|
dynatemp_range: this.config.dynatemp_range,
|
||||||
|
dynatemp_exponent: this.config.dynatemp_exponent,
|
||||||
top_k: this.config.top_k,
|
top_k: this.config.top_k,
|
||||||
top_p: this.config.top_p,
|
top_p: this.config.top_p,
|
||||||
|
min_p: this.config.min_p,
|
||||||
|
typical_p: this.config.typical_p,
|
||||||
|
xtc_probability: this.config.xtc_probability,
|
||||||
|
xtc_threshold: this.config.xtc_threshold,
|
||||||
|
repeat_last_n: this.config.repeat_last_n,
|
||||||
|
repeat_penalty: this.config.repeat_penalty,
|
||||||
|
presence_penalty: this.config.presence_penalty,
|
||||||
|
frequency_penalty: this.config.frequency_penalty,
|
||||||
|
dry_multiplier: this.config.dry_multiplier,
|
||||||
|
dry_base: this.config.dry_base,
|
||||||
|
dry_allowed_length: this.config.dry_allowed_length,
|
||||||
|
dry_penalty_last_n: this.config.dry_penalty_last_n,
|
||||||
max_tokens: this.config.max_tokens,
|
max_tokens: this.config.max_tokens,
|
||||||
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
|
...(this.config.custom.length ? JSON.parse(this.config.custom) : {}),
|
||||||
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
|
...(this.config.apiKey ? { api_key: this.config.apiKey } : {}),
|
||||||
@ -499,6 +644,7 @@
|
|||||||
this.isGenerating = false;
|
this.isGenerating = false;
|
||||||
this.stopGeneration = () => {};
|
this.stopGeneration = () => {};
|
||||||
this.fetchMessages();
|
this.fetchMessages();
|
||||||
|
chatScrollToBottom();
|
||||||
},
|
},
|
||||||
|
|
||||||
// message actions
|
// message actions
|
||||||
@ -512,7 +658,7 @@
|
|||||||
this.generateMessage(currConvId);
|
this.generateMessage(currConvId);
|
||||||
},
|
},
|
||||||
copyMsg(msg) {
|
copyMsg(msg) {
|
||||||
navigator.clipboard.writeText(msg.content);
|
copyStr(msg.content);
|
||||||
},
|
},
|
||||||
editUserMsgAndRegenerate(msg) {
|
editUserMsgAndRegenerate(msg) {
|
||||||
if (this.isGenerating) return;
|
if (this.isGenerating) return;
|
||||||
|
@ -2,10 +2,11 @@
|
|||||||
|
|
||||||
#include "arg.h"
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "log.h"
|
|
||||||
#include "sampling.h"
|
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
#include "log.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
#include "speculative.h"
|
||||||
|
|
||||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
@ -102,9 +103,15 @@ struct server_task_result {
|
|||||||
bool error;
|
bool error;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct server_static_file {
|
||||||
|
const unsigned char * data;
|
||||||
|
unsigned int size;
|
||||||
|
const char * mime_type;
|
||||||
|
};
|
||||||
|
|
||||||
struct slot_params {
|
struct slot_params {
|
||||||
bool stream = true;
|
bool stream = true;
|
||||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
|
||||||
|
|
||||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||||
@ -115,12 +122,21 @@ struct slot_params {
|
|||||||
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
std::vector<std::string> antiprompt;
|
||||||
|
|
||||||
|
struct common_params_sampling sampling;
|
||||||
|
struct common_params_speculative speculative;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct server_slot {
|
struct server_slot {
|
||||||
int id;
|
int id;
|
||||||
int id_task = -1;
|
int id_task = -1;
|
||||||
|
|
||||||
|
llama_batch batch_spec;
|
||||||
|
|
||||||
|
llama_context * ctx_dft = nullptr;
|
||||||
|
|
||||||
|
common_speculative * spec = nullptr;
|
||||||
|
|
||||||
// the index relative to completion multi-task request
|
// the index relative to completion multi-task request
|
||||||
size_t index = 0;
|
size_t index = 0;
|
||||||
|
|
||||||
@ -169,7 +185,6 @@ struct server_slot {
|
|||||||
// sampling
|
// sampling
|
||||||
json json_schema;
|
json json_schema;
|
||||||
|
|
||||||
struct common_sampler_params sparams;
|
|
||||||
struct common_sampler * smpl = nullptr;
|
struct common_sampler * smpl = nullptr;
|
||||||
|
|
||||||
llama_token sampled;
|
llama_token sampled;
|
||||||
@ -206,7 +221,7 @@ struct server_slot {
|
|||||||
generated_token_probs.clear();
|
generated_token_probs.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool has_budget(common_params &global_params) {
|
bool has_budget(const common_params & global_params) {
|
||||||
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
||||||
return true; // limitless
|
return true; // limitless
|
||||||
}
|
}
|
||||||
@ -226,6 +241,10 @@ struct server_slot {
|
|||||||
return state != SLOT_STATE_IDLE;
|
return state != SLOT_STATE_IDLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool can_speculate() const {
|
||||||
|
return ctx_dft && params.speculative.n_max > 0 && params.cache_prompt;
|
||||||
|
}
|
||||||
|
|
||||||
void add_token(const completion_token_output & token) {
|
void add_token(const completion_token_output & token) {
|
||||||
if (!is_processing()) {
|
if (!is_processing()) {
|
||||||
SLT_WRN(*this, "%s", "slot is not processing\n");
|
SLT_WRN(*this, "%s", "slot is not processing\n");
|
||||||
@ -585,11 +604,14 @@ struct server_response {
|
|||||||
};
|
};
|
||||||
|
|
||||||
struct server_context {
|
struct server_context {
|
||||||
|
common_params params_base;
|
||||||
|
|
||||||
llama_model * model = nullptr;
|
llama_model * model = nullptr;
|
||||||
llama_context * ctx = nullptr;
|
llama_context * ctx = nullptr;
|
||||||
std::vector<common_lora_adapter_container> loras;
|
std::vector<common_lora_adapter_container> loras;
|
||||||
|
|
||||||
common_params params;
|
llama_model * model_dft = nullptr;
|
||||||
|
llama_context_params cparams_dft;
|
||||||
|
|
||||||
llama_batch batch = {};
|
llama_batch batch = {};
|
||||||
|
|
||||||
@ -622,27 +644,41 @@ struct server_context {
|
|||||||
model = nullptr;
|
model = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (model_dft) {
|
||||||
|
llama_free_model(model_dft);
|
||||||
|
model_dft = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
// Clear any sampling context
|
// Clear any sampling context
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
if (slot.smpl != nullptr) {
|
common_sampler_free(slot.smpl);
|
||||||
common_sampler_free(slot.smpl);
|
slot.smpl = nullptr;
|
||||||
}
|
|
||||||
|
llama_free(slot.ctx_dft);
|
||||||
|
slot.ctx_dft = nullptr;
|
||||||
|
|
||||||
|
common_speculative_free(slot.spec);
|
||||||
|
slot.spec = nullptr;
|
||||||
|
|
||||||
|
llama_batch_free(slot.batch_spec);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_model(const common_params & params_) {
|
bool load_model(const common_params & params) {
|
||||||
params = params_;
|
SRV_INF("loading model '%s'\n", params.model.c_str());
|
||||||
|
|
||||||
common_init_result llama_init = common_init_from_params(params);
|
params_base = params;
|
||||||
|
|
||||||
|
common_init_result llama_init = common_init_from_params(params_base);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
loras = llama_init.lora_adapters;
|
loras = llama_init.lora_adapters;
|
||||||
|
|
||||||
if (model == nullptr) {
|
if (model == nullptr) {
|
||||||
SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
|
SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -651,32 +687,88 @@ struct server_context {
|
|||||||
add_bos_token = llama_add_bos_token(model);
|
add_bos_token = llama_add_bos_token(model);
|
||||||
has_eos_token = !llama_add_eos_token(model);
|
has_eos_token = !llama_add_eos_token(model);
|
||||||
|
|
||||||
|
if (!params_base.speculative.model.empty()) {
|
||||||
|
SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
|
||||||
|
|
||||||
|
auto params_dft = params_base;
|
||||||
|
|
||||||
|
params_dft.devices = params_base.speculative.devices;
|
||||||
|
params_dft.model = params_base.speculative.model;
|
||||||
|
params_dft.n_ctx = params_base.speculative.n_ctx;
|
||||||
|
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
|
||||||
|
|
||||||
|
common_init_result llama_init_dft = common_init_from_params(params_dft);
|
||||||
|
|
||||||
|
model_dft = llama_init_dft.model;
|
||||||
|
|
||||||
|
if (model_dft == nullptr) {
|
||||||
|
SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
|
||||||
|
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str());
|
||||||
|
|
||||||
|
llama_free (llama_init_dft.context);
|
||||||
|
llama_free_model(llama_init_dft.model);
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
cparams_dft = common_context_params_to_llama(params_base);
|
||||||
|
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
|
||||||
|
|
||||||
|
// the context is not needed - we will create one for each slot
|
||||||
|
llama_free(llama_init_dft.context);
|
||||||
|
}
|
||||||
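    // Editor's note: once a draft model is loaded here, each slot gets its own draft
    // context and speculator (see init() below). Example launch with assumed flag names
    // (consult `llama-server --help` for the exact spelling):
    //   llama-server -m model.gguf -md draft-model.gguf --gpu-layers-draft 99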
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool validate_model_chat_template() const {
|
bool validate_model_chat_template() const {
|
||||||
llama_chat_message chat[] = {{"user", "test"}};
|
std::vector<char> model_template(2048, 0); // longest known template is about 1200 bytes
|
||||||
|
std::string template_key = "tokenizer.chat_template";
|
||||||
const int res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
|
int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
|
||||||
|
if (res >= 0) {
|
||||||
return res > 0;
|
llama_chat_message chat[] = {{"user", "test"}};
|
||||||
|
std::string tmpl = std::string(model_template.data(), model_template.size());
|
||||||
|
int32_t chat_res = llama_chat_apply_template(model, tmpl.c_str(), chat, 1, true, nullptr, 0);
|
||||||
|
return chat_res > 0;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void init() {
|
void init() {
|
||||||
const int32_t n_ctx_slot = n_ctx / params.n_parallel;
|
const int32_t n_ctx_slot = n_ctx / params_base.n_parallel;
|
||||||
|
|
||||||
SRV_INF("initializing slots, n_slots = %d\n", params.n_parallel);
|
SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
|
||||||
|
|
||||||
for (int i = 0; i < params.n_parallel; i++) {
|
for (int i = 0; i < params_base.n_parallel; i++) {
|
||||||
server_slot slot;
|
server_slot slot;
|
||||||
|
|
||||||
slot.id = i;
|
slot.id = i;
|
||||||
slot.n_ctx = n_ctx_slot;
|
slot.n_ctx = n_ctx_slot;
|
||||||
slot.n_predict = params.n_predict;
|
slot.n_predict = params_base.n_predict;
|
||||||
|
|
||||||
|
if (model_dft) {
|
||||||
|
slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);
|
||||||
|
|
||||||
|
slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
|
||||||
|
if (slot.ctx_dft == nullptr) {
|
||||||
|
SRV_ERR("%s", "failed to create draft context\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
slot.spec = common_speculative_init(slot.ctx_dft);
|
||||||
|
if (slot.spec == nullptr) {
|
||||||
|
SRV_ERR("%s", "failed to create speculator\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||||
|
|
||||||
slot.sparams = params.sparams;
|
slot.params.sampling = params_base.sampling;
|
||||||
|
|
||||||
slot.callback_on_release = [this](int) {
|
slot.callback_on_release = [this](int) {
|
||||||
queue_tasks.pop_deferred_task();
|
queue_tasks.pop_deferred_task();
|
||||||
@ -696,7 +788,7 @@ struct server_context {
|
|||||||
const int32_t n_batch = llama_n_batch(ctx);
|
const int32_t n_batch = llama_n_batch(ctx);
|
||||||
|
|
||||||
// only a single seq_id per token is needed
|
// only a single seq_id per token is needed
|
||||||
batch = llama_batch_init(std::max(n_batch, params.n_parallel), 0, 1);
|
batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
metrics.init();
|
metrics.init();
|
||||||
@ -732,7 +824,7 @@ struct server_context {
|
|||||||
}
|
}
|
||||||
|
|
||||||
        // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
-       int cur_lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
+       int cur_lcs_len = common_lcs(slot.cache_tokens, task.prompt_tokens);

        // fraction of the common subsequence length compared to the current slot's prompt length
        float cur_similarity = static_cast<float>(cur_lcs_len) / static_cast<int>(slot.cache_tokens.size());
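        // Editor's note (illustrative numbers, not part of the upstream code): if a slot has
        // 100 cached tokens and the incoming prompt shares an LCS of 80 tokens with that cache,
        // cur_similarity = 80.0f / 100 = 0.8f; slots with a higher fraction of reusable tokens
        // are preferred when assigning the task.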
@ -775,9 +867,11 @@ struct server_context {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
bool launch_slot_with_task(server_slot & slot, const server_task & task) {
|
||||||
slot_params default_params;
|
|
||||||
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
// Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
|
||||||
auto default_sparams = params.sparams;
|
slot_params defaults;
|
||||||
|
defaults.sampling = params_base.sampling;
|
||||||
|
defaults.speculative = params_base.speculative;
|
||||||
|
|
||||||
const auto & data = task.data;
|
const auto & data = task.data;
|
||||||
|
|
||||||
if (data.count("__oaicompat") != 0) {
|
if (data.count("__oaicompat") != 0) {
|
||||||
@ -788,42 +882,48 @@ struct server_context {
|
|||||||
slot.oaicompat_model = "";
|
slot.oaicompat_model = "";
|
||||||
}
|
}
|
||||||
|
|
||||||
slot.params.stream = json_value(data, "stream", false);
|
slot.params.stream = json_value(data, "stream", false);
|
||||||
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
|
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
|
||||||
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
|
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
|
||||||
slot.params.n_indent = json_value(data, "n_indent", default_params.n_indent);
|
slot.params.n_indent = json_value(data, "n_indent", defaults.n_indent);
|
||||||
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
|
slot.params.n_keep = json_value(data, "n_keep", defaults.n_keep);
|
||||||
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
|
slot.params.n_discard = json_value(data, "n_discard", defaults.n_discard);
|
||||||
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
|
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
|
||||||
slot.sparams.xtc_probability = json_value(data, "xtc_probability", default_sparams.xtc_probability);
|
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
|
||||||
slot.sparams.xtc_threshold = json_value(data, "xtc_threshold", default_sparams.xtc_threshold);
|
|
||||||
slot.sparams.typ_p = json_value(data, "typical_p", default_sparams.typ_p);
|
|
||||||
slot.sparams.temp = json_value(data, "temperature", default_sparams.temp);
|
|
||||||
slot.sparams.dynatemp_range = json_value(data, "dynatemp_range", default_sparams.dynatemp_range);
|
|
||||||
slot.sparams.dynatemp_exponent = json_value(data, "dynatemp_exponent", default_sparams.dynatemp_exponent);
|
|
||||||
slot.sparams.penalty_last_n = json_value(data, "repeat_last_n", default_sparams.penalty_last_n);
|
|
||||||
slot.sparams.penalty_repeat = json_value(data, "repeat_penalty", default_sparams.penalty_repeat);
|
|
||||||
slot.sparams.penalty_freq = json_value(data, "frequency_penalty", default_sparams.penalty_freq);
|
|
||||||
slot.sparams.penalty_present = json_value(data, "presence_penalty", default_sparams.penalty_present);
|
|
||||||
-        slot.sparams.dry_multiplier = json_value(data, "dry_multiplier", default_sparams.dry_multiplier);
-        slot.sparams.dry_base = json_value(data, "dry_base", default_sparams.dry_base);
-        slot.sparams.dry_allowed_length = json_value(data, "dry_allowed_length", default_sparams.dry_allowed_length);
-        slot.sparams.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", default_sparams.dry_penalty_last_n);
-        slot.sparams.mirostat = json_value(data, "mirostat", default_sparams.mirostat);
-        slot.sparams.mirostat_tau = json_value(data, "mirostat_tau", default_sparams.mirostat_tau);
-        slot.sparams.mirostat_eta = json_value(data, "mirostat_eta", default_sparams.mirostat_eta);
-        slot.sparams.penalize_nl = json_value(data, "penalize_nl", default_sparams.penalize_nl);
-        slot.params.n_keep = json_value(data, "n_keep", default_params.n_keep);
-        slot.params.n_discard = json_value(data, "n_discard", default_params.n_discard);
-        slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
-        slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
-        slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
-        //slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
-        slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
-
-        if (slot.sparams.dry_base < 1.0f)
-        {
-            slot.sparams.dry_base = default_sparams.dry_base;
-        }
+        slot.params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
+        slot.params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
+        slot.params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p);
+        slot.params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability);
+        slot.params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold);
+        slot.params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p);
+        slot.params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp);
+        slot.params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range);
+        slot.params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent);
+        slot.params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n);
+        slot.params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat);
+        slot.params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq);
+        slot.params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present);
+        slot.params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier);
+        slot.params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base);
+        slot.params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length);
+        slot.params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n);
+        slot.params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat);
+        slot.params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau);
+        slot.params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta);
+        slot.params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl);
+        slot.params.sampling.seed = json_value(data, "seed", defaults.sampling.seed);
+        slot.params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs);
+        slot.params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep);
+
+        slot.params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
+        slot.params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
+        slot.params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
+
+        slot.params.speculative.n_min = std::min(slot.params.speculative.n_max, slot.params.speculative.n_min);
+
+        if (slot.params.sampling.dry_base < 1.0f) {
+            slot.params.sampling.dry_base = defaults.sampling.dry_base;
+        }

         // sequence breakers for DRY
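For reference, the per-request speculative keys read above ("speculative.n_min", "speculative.n_max", "speculative.p_min") sit at the top level of the completion request body next to the existing sampling keys; the following fragment is an illustration only, with invented values (they mirror the defaults suggested by the example README later in this commit):

```cpp
// Illustration only: a /completion request body exercising the new keys (values invented).
const char * example_request = R"({
    "prompt": "...",
    "temperature": 0.8,
    "speculative.n_max": 16,
    "speculative.n_min": 5,
    "speculative.p_min": 0.9
})";
```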
@@ -832,8 +932,8 @@ struct server_context {
         // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39
         if (data.contains("dry_sequence_breakers")) {
-            slot.sparams.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
-            if (slot.sparams.dry_sequence_breakers.empty()) {
+            slot.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector<std::string>());
+            if (slot.params.sampling.dry_sequence_breakers.empty()) {
                 send_error(task, "Error: dry_sequence_breakers must be a non-empty array of strings", ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
@@ -847,14 +947,14 @@ struct server_context {
         }
         if (data.contains("json_schema") && !data.contains("grammar")) {
             try {
                 auto schema = json_value(data, "json_schema", json::object());
-                slot.sparams.grammar = json_schema_to_grammar(schema);
+                slot.params.sampling.grammar = json_schema_to_grammar(schema);
             } catch (const std::exception & e) {
                 send_error(task, std::string("\"json_schema\": ") + e.what(), ERROR_TYPE_INVALID_REQUEST);
                 return false;
             }
         } else {
-            slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
+            slot.params.sampling.grammar = json_value(data, "grammar", defaults.sampling.grammar);
         }

         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
@@ -864,10 +964,10 @@ struct server_context {
         }

         {
-            slot.sparams.logit_bias.clear();
+            slot.params.sampling.logit_bias.clear();

             if (json_value(data, "ignore_eos", false) && has_eos_token) {
-                slot.sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+                slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
             }

             const auto & logit_bias = data.find("logit_bias");
@@ -888,12 +988,12 @@ struct server_context {
                 if (el[0].is_number_integer()) {
                     llama_token tok = el[0].get<llama_token>();
                     if (tok >= 0 && tok < n_vocab) {
-                        slot.sparams.logit_bias.push_back({tok, bias});
+                        slot.params.sampling.logit_bias.push_back({tok, bias});
                     }
                 } else if (el[0].is_string()) {
                     auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                     for (auto tok : toks) {
-                        slot.sparams.logit_bias.push_back({tok, bias});
+                        slot.params.sampling.logit_bias.push_back({tok, bias});
                     }
                 }
             }
@@ -916,16 +1016,24 @@ struct server_context {

         {
             const auto & samplers = data.find("samplers");
-            if (samplers != data.end() && samplers->is_array()) {
-                std::vector<std::string> sampler_names;
-                for (const auto & name : *samplers) {
-                    if (name.is_string()) {
-                        sampler_names.emplace_back(name);
-                    }
-                }
-                slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
-            } else {
-                slot.sparams.samplers = default_sparams.samplers;
-            }
+            if (samplers != data.end()) {
+                if (samplers->is_array()) {
+                    std::vector<std::string> sampler_names;
+                    for (const auto & name : *samplers) {
+                        if (name.is_string()) {
+                            sampler_names.emplace_back(name);
+                        }
+                    }
+                    slot.params.sampling.samplers = common_sampler_types_from_names(sampler_names, false);
+                } else if (samplers->is_string()){
+                    std::string sampler_string;
+                    for (const auto & name : *samplers) {
+                        sampler_string += name;
+                    }
+                    slot.params.sampling.samplers = common_sampler_types_from_chars(sampler_string);
+                }
+            } else {
+                slot.params.sampling.samplers = defaults.sampling.samplers;
+            }
         }
     }

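With this change the request field "samplers" is accepted either as an array of sampler names or as a single string of one-character codes; the exact character mapping is defined by common_sampler_types_from_chars and is not part of this diff, so the values below are an illustration only:

```cpp
// Illustration only (invented values):
// "samplers": ["top_k", "top_p", "temperature"]  -> parsed via common_sampler_types_from_names()
// "samplers": "kpt"                              -> parsed via common_sampler_types_from_chars()
```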
@@ -934,7 +1042,7 @@ struct server_context {
             common_sampler_free(slot.smpl);
         }

-        slot.smpl = common_sampler_init(model, slot.sparams);
+        slot.smpl = common_sampler_init(model, slot.params.sampling);
         if (slot.smpl == nullptr) {
             // for now, the only error that may happen here is invalid grammar
             send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);
@@ -942,6 +1050,12 @@ struct server_context {
             }
         }

+        if (slot.ctx_dft) {
+            llama_batch_free(slot.batch_spec);
+
+            slot.batch_spec = llama_batch_init(slot.params.speculative.n_max + 1, 0, 1);
+        }
+
         slot.state = SLOT_STATE_STARTED;

         SLT_INF(slot, "%s", "processing task\n");
@@ -959,7 +1073,7 @@ struct server_context {

     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
         slot.sampled = result.tok;

         // search stop word and delete it
@@ -1024,7 +1138,7 @@ struct server_context {
         }

         // check the limits
-        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params)) {
+        if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) {
             slot.stopped_limit = true;
             slot.has_next_token = false;

@@ -1117,50 +1231,54 @@ struct server_context {

     json get_formated_generation(const server_slot & slot) const {
         std::vector<std::string> samplers;
-        samplers.reserve(slot.sparams.samplers.size());
-        for (const auto & sampler : slot.sparams.samplers) {
+        samplers.reserve(slot.params.sampling.samplers.size());
+        for (const auto & sampler : slot.params.sampling.samplers) {
             samplers.emplace_back(common_sampler_type_to_str(sampler));
         }

         return json {
             {"n_ctx", slot.n_ctx},
             {"n_predict", slot.n_predict}, // Server configured n_predict
-            {"model", params.model_alias},
-            {"seed", slot.sparams.seed},
+            {"model", params_base.model_alias},
+            {"seed", slot.params.sampling.seed},
             {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
-            {"temperature", slot.sparams.temp},
-            {"dynatemp_range", slot.sparams.dynatemp_range},
-            {"dynatemp_exponent", slot.sparams.dynatemp_exponent},
-            {"top_k", slot.sparams.top_k},
-            {"top_p", slot.sparams.top_p},
-            {"min_p", slot.sparams.min_p},
-            {"xtc_probability", slot.sparams.xtc_probability},
-            {"xtc_threshold", slot.sparams.xtc_threshold},
-            {"typical_p", slot.sparams.typ_p},
-            {"repeat_last_n", slot.sparams.penalty_last_n},
-            {"repeat_penalty", slot.sparams.penalty_repeat},
-            {"presence_penalty", slot.sparams.penalty_present},
-            {"frequency_penalty", slot.sparams.penalty_freq},
-            {"dry_multiplier", slot.sparams.dry_multiplier},
-            {"dry_base", slot.sparams.dry_base},
-            {"dry_allowed_length", slot.sparams.dry_allowed_length},
-            {"dry_penalty_last_n", slot.sparams.dry_penalty_last_n},
-            {"dry_sequence_breakers", slot.sparams.dry_sequence_breakers},
-            {"mirostat", slot.sparams.mirostat},
-            {"mirostat_tau", slot.sparams.mirostat_tau},
-            {"mirostat_eta", slot.sparams.mirostat_eta},
-            {"penalize_nl", slot.sparams.penalize_nl},
+            {"temperature", slot.params.sampling.temp},
+            {"dynatemp_range", slot.params.sampling.dynatemp_range},
+            {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent},
+            {"top_k", slot.params.sampling.top_k},
+            {"top_p", slot.params.sampling.top_p},
+            {"min_p", slot.params.sampling.min_p},
+            {"xtc_probability", slot.params.sampling.xtc_probability},
+            {"xtc_threshold", slot.params.sampling.xtc_threshold},
+            {"typical_p", slot.params.sampling.typ_p},
+            {"repeat_last_n", slot.params.sampling.penalty_last_n},
+            {"repeat_penalty", slot.params.sampling.penalty_repeat},
+            {"presence_penalty", slot.params.sampling.penalty_present},
+            {"frequency_penalty", slot.params.sampling.penalty_freq},
+            {"dry_multiplier", slot.params.sampling.dry_multiplier},
+            {"dry_base", slot.params.sampling.dry_base},
+            {"dry_allowed_length", slot.params.sampling.dry_allowed_length},
+            {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n},
+            {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers},
+            {"mirostat", slot.params.sampling.mirostat},
+            {"mirostat_tau", slot.params.sampling.mirostat_tau},
+            {"mirostat_eta", slot.params.sampling.mirostat_eta},
+            {"penalize_nl", slot.params.sampling.penalize_nl},
             {"stop", slot.params.antiprompt},
             {"max_tokens", slot.params.n_predict}, // User configured n_predict
             {"n_keep", slot.params.n_keep},
             {"n_discard", slot.params.n_discard},
-            {"ignore_eos", slot.sparams.ignore_eos},
+            {"ignore_eos", slot.params.sampling.ignore_eos},
             {"stream", slot.params.stream},
-            //{"logit_bias", slot.sparams.logit_bias},
-            {"n_probs", slot.sparams.n_probs},
-            {"min_keep", slot.sparams.min_keep},
-            {"grammar", slot.sparams.grammar},
+            //{"logit_bias", slot.params.sampling.logit_bias},
+            {"n_probs", slot.params.sampling.n_probs},
+            {"min_keep", slot.params.sampling.min_keep},
+            {"grammar", slot.params.sampling.grammar},
             {"samplers", samplers},
+            {"speculative", slot.can_speculate()},
+            {"speculative.n_max", slot.params.speculative.n_max},
+            {"speculative.n_min", slot.params.speculative.n_min},
+            {"speculative.p_min", slot.params.speculative.p_min},
         };
     }

@@ -1197,7 +1315,7 @@ struct server_context {
             {"index", slot.index},
         };

-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
             const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
@@ -1230,7 +1348,7 @@ struct server_context {
             {"content", !slot.params.stream ? slot.generated_text : ""},
             {"id_slot", slot.id},
             {"stop", true},
-            {"model", params.model_alias},
+            {"model", params_base.model_alias},
             {"tokens_predicted", slot.n_decoded},
             {"tokens_evaluated", slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
@@ -1246,7 +1364,7 @@ struct server_context {
             {"index", slot.index},
         };

-        if (slot.sparams.n_probs > 0) {
+        if (slot.params.sampling.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
                 const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
@@ -1403,10 +1521,10 @@ struct server_context {
                     data.at("input_prefix"),
                     data.at("input_suffix"),
                     data.at("input_extra"),
-                    params.n_batch,
-                    params.n_predict,
+                    params_base.n_batch,
+                    params_base.n_predict,
                     slots[0].n_ctx, // TODO: there should be a better way
-                    params.spm_infill,
+                    params_base.spm_infill,
                     tokenized_prompts[i]
                 );
                 create_task(data, tokens);
@@ -1779,7 +1897,7 @@ struct server_context {
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
             if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
-                if (!params.ctx_shift) {
+                if (!params_base.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()
                     slot.release();
@@ -1845,7 +1963,7 @@ struct server_context {
         int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;

         // next, batch any pending prompts without exceeding n_batch
-        if (params.cont_batching || batch.n_tokens == 0) {
+        if (params_base.cont_batching || batch.n_tokens == 0) {
             for (auto & slot : slots) {
                 // this slot still has a prompt to be processed
                 if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_STARTED) {
@@ -1898,7 +2016,7 @@ struct server_context {
                             continue;
                         }
                     } else {
-                        if (!params.ctx_shift) {
+                        if (!params_base.ctx_shift) {
                             // if context shift is disabled, we make sure prompt size is smaller than KV size
                             // TODO: there should be a separate parameter that control prompt truncation
                             // context shift should be applied only during the generation phase
@@ -1941,14 +2059,14 @@ struct server_context {

                     if (slot.params.cache_prompt) {
                         // reuse any previously computed tokens that are common with the new prompt
-                        slot.n_past = longest_common_prefix(slot.cache_tokens, prompt_tokens);
+                        slot.n_past = common_lcp(slot.cache_tokens, prompt_tokens);

                         // reuse chunks from the cached prompt by shifting their KV cache in the new position
-                        if (params.n_cache_reuse > 0) {
+                        if (params_base.n_cache_reuse > 0) {
                             size_t head_c = slot.n_past; // cache
                             size_t head_p = slot.n_past; // current prompt

-                            SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params.n_cache_reuse, slot.n_past);
+                            SLT_DBG(slot, "trying to reuse chunks with size > %d, slot.n_past = %d\n", params_base.n_cache_reuse, slot.n_past);

                             while (head_c < slot.cache_tokens.size() &&
                                    head_p < prompt_tokens.size()) {
@@ -1961,7 +2079,7 @@ struct server_context {
                                     n_match++;
                                 }

-                                if (n_match >= (size_t) params.n_cache_reuse) {
+                                if (n_match >= (size_t) params_base.n_cache_reuse) {
                                     SLT_INF(slot, "reusing chunk with size %zu, shifting KV cache [%zu, %zu) -> [%zu, %zu)\n", n_match, head_c, head_c + n_match, head_p, head_p + n_match);
                                     //for (size_t i = head_p; i < head_p + n_match; i++) {
                                     //    SLT_DBG(slot, "cache token %3zu: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
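Read together, the two hunks above are the prompt-cache reuse path. A hedged summary of the visible logic, written as comments rather than as an exact copy of the implementation:

```cpp
// common_lcp(cache_tokens, prompt_tokens) returns the length of the longest common prefix,
// which is kept as-is (slot.n_past starts there).
// Past that prefix, head_c walks the cached tokens and head_p walks the new prompt;
// whenever a run of n_match identical tokens with n_match >= params_base.n_cache_reuse is found,
// that chunk is not recomputed - its KV cache entries are shifted from
// [head_c, head_c + n_match) to [head_p, head_p + n_match), as the SLT_INF message reports.
```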
@@ -2149,38 +2267,99 @@ struct server_context {
                 continue; // continue loop of slots
             }

-            completion_token_output result;
-            const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
-
-            common_sampler_accept(slot.smpl, id, true);
-
-            slot.n_decoded += 1;
-            if (slot.n_decoded == 1) {
-                slot.t_start_generation = ggml_time_us();
-                slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
-                metrics.on_prompt_eval(slot);
-            }
-
-            result.tok = id;
-
-            const auto * cur_p = common_sampler_get_candidates(slot.smpl);
-
-            for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
-                result.probs.push_back({
-                    cur_p->data[i].id,
-                    i >= cur_p->size ? 0.0f : cur_p->data[i].p,
-                });
-            }
-
-            if (!process_token(result, slot)) {
-                // release slot because of stop condition
-                slot.release();
-                slot.print_timings();
-                send_final_response(slot);
-                metrics.on_prediction(slot);
-            }
-
-            slot.i_batch = -1;
+            llama_token id;
+
+            {
+                completion_token_output result;
+
+                id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+
+                slot.i_batch = -1;
+
+                common_sampler_accept(slot.smpl, id, true);
+
+                slot.n_decoded += 1;
+                if (slot.n_decoded == 1) {
+                    slot.t_start_generation = ggml_time_us();
+                    slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
+                    metrics.on_prompt_eval(slot);
+                }
+
+                result.tok = id;
+
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);
+
+                for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) {
+                    result.probs.push_back({
+                        cur_p->data[i].id,
+                        i >= cur_p->size ? 0.0f : cur_p->data[i].p,
+                    });
+                }
+
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
+                    continue;
+                }
+            }
+
+            // check if the slot supports speculative decoding
+            if (!slot.can_speculate()) {
+                continue;
+            }
+
+            struct common_speculative_params params_spec;
+            params_spec.n_draft = slot.params.speculative.n_max;
+            params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
+            params_spec.p_min   = slot.params.speculative.p_min;
+
+            llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
+
+            // ignore small drafts
+            if (slot.params.speculative.n_min > (int) draft.size()) {
+                continue;
+            }
+
+            // construct the speculation batch
+            common_batch_clear(slot.batch_spec);
+            common_batch_add  (slot.batch_spec, id, slot.n_past, { slot.id }, true);
+
+            for (size_t i = 0; i < draft.size(); ++i) {
+                common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
+            }
+
+            llama_decode(ctx, slot.batch_spec);
+
+            // the accepted tokens from the speculation
+            const auto ids = common_sampler_sample_and_accept_n(slot.smpl, ctx, draft);
+
+            slot.n_past    += ids.size();
+            slot.n_decoded += ids.size();
+
+            slot.cache_tokens.push_back(id);
+            slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
+
+            llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+
+            for (size_t i = 0; i < ids.size(); ++i) {
+                completion_token_output result;
+
+                result.tok = ids[i];
+
+                if (!process_token(result, slot)) {
+                    // release slot because of stop condition
+                    slot.release();
+                    slot.print_timings();
+                    send_final_response(slot);
+                    metrics.on_prediction(slot);
+                    break;
+                }
+            }
+
+            SRV_DBG("accepted %d/%d draft tokens\n", (int) ids.size() - 1, (int) draft.size());
         }
     }
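The bookkeeping after common_sampler_sample_and_accept_n in the new code is easier to follow with a worked example; the token names and counts below are invented for illustration, while the relationships come from the lines above:

```cpp
// draft          = { d0, d1, d2, d3 }   // 4 tokens proposed by the draft model
// ids            = { d0, d1, s }        // target accepted d0 and d1, then sampled s instead of d2
// slot.n_past   += ids.size();          // positions for id, d0, d1 are now settled in the target KV cache
// slot.cache_tokens gets id, d0, d1     // ids.end() - 1 excludes s, which is not decoded yet
// llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);  // drops the rejected positions (d2, d3)
// SRV_DBG reports ids.size() - 1 = 2 accepted out of draft.size() = 4
```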
@@ -2254,6 +2433,16 @@ int main(int argc, char ** argv) {
     LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     LOG_INF("\n");

+    // static files
+    std::map<std::string, server_static_file> static_files = {
+        { "/",                        { index_html,              index_html_len,              "text/html; charset=utf-8" }},
+        { "/completion.js",           { completion_js,           completion_js_len,           "text/javascript; charset=utf-8" }},
+        { "/deps_daisyui.min.css",    { deps_daisyui_min_css,    deps_daisyui_min_css_len,    "text/css; charset=utf-8" }},
+        { "/deps_markdown-it.js",     { deps_markdown_it_js,     deps_markdown_it_js_len,     "text/javascript; charset=utf-8" }},
+        { "/deps_tailwindcss.js",     { deps_tailwindcss_js,     deps_tailwindcss_js_len,     "text/javascript; charset=utf-8" }},
+        { "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }},
+    };
+
     std::unique_ptr<httplib::Server> svr;
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
     if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
@@ -2334,7 +2523,7 @@ int main(int argc, char ** argv) {
     // Middlewares
     //

-    auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
+    auto middleware_validate_api_key = [&params, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) {
         static const std::unordered_set<std::string> public_endpoints = {
             "/health",
             "/models",
@@ -2346,8 +2535,8 @@ int main(int argc, char ** argv) {
             return true;
         }

-        // If path is public, skip validation
-        if (public_endpoints.find(req.path) != public_endpoints.end()) {
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) {
             return true;
         }
@@ -2668,7 +2857,7 @@ int main(int argc, char ** argv) {
     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
         json data = {
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
-            { "total_slots",                 ctx_server.params.n_parallel },
+            { "total_slots",                 ctx_server.params_base.n_parallel },
             { "chat_template",               llama_get_chat_template(ctx_server.model) },
         };

@@ -2676,7 +2865,7 @@ int main(int argc, char ** argv) {
     };

     const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.endpoint_props) {
+        if (!ctx_server.params_base.endpoint_props) {
             res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2689,7 +2878,7 @@ int main(int argc, char ** argv) {
     };

     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2795,7 +2984,7 @@ int main(int argc, char ** argv) {

     // TODO: maybe merge this function with "handle_completions_generic"
     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) {
-        if (ctx_server.params.embedding) {
+        if (ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -2972,7 +3161,7 @@ int main(int argc, char ** argv) {
     };

     const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params.reranking || ctx_server.params.embedding) {
+        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
             res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }
@@ -3091,13 +3280,6 @@ int main(int argc, char ** argv) {
         res.status = 200; // HTTP OK
     };

-    auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
-        return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
-            res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
-            return false;
-        };
-    };
-
     //
     // Router
     //
@@ -3112,12 +3294,13 @@ int main(int argc, char ** argv) {
         }
     } else {
         // using embedded static files
-        svr->Get("/",                        handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
-        svr->Get("/completion.js",           handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/deps_daisyui.min.css",    handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
-        svr->Get("/deps_markdown-it.js",     handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/deps_tailwindcss.js",     handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
-        svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
+        for (const auto & it : static_files) {
+            const server_static_file & static_file = it.second;
+            svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) {
+                res.set_content(reinterpret_cast<const char*>(static_file.data), static_file.size, static_file.mime_type);
+                return false;
+            });
+        }
     }

     // register API routes
@@ -24,7 +24,6 @@
 #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"

 using json = nlohmann::ordered_json;
-using llama_tokens = std::vector<llama_token>;

 #define SLT_INF(slot, fmt, ...) LOG_INF("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
 #define SLT_WRN(slot, fmt, ...) LOG_WRN("slot %12.*s: id %2d | task %d | " fmt, 12, __func__, (slot).id, (slot).id_task, __VA_ARGS__)
@@ -439,62 +438,6 @@ static std::string gen_chatcmplid() {
 // other common utils
 //

-static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
-    return i;
-}
-
-static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
-    // check for empty sequences
-    if (a.empty() || b.empty()) {
-        return 0;
-    }
-
-    // get the lengths of the input sequences
-    size_t a_len = a.size();
-    size_t b_len = b.size();
-
-    // initialize the maximum length of the longest common subsequence (LCS)
-    size_t max_length = 0;
-
-    // use two rows instead of a 2D matrix to optimize space
-    std::vector<size_t> prev_row(b_len + 1, 0);
-    std::vector<size_t> curr_row(b_len + 1, 0);
-
-    // iterate through the elements of a
-    for (size_t i = 1; i <= a_len; i++) {
-        // iterate through the elements of b
-        for (size_t j = 1; j <= b_len; j++) {
-            // if elements at the current positions match
-            if (a[i - 1] == b[j - 1]) {
-                // if it's the first element of either sequences, set LCS length to 1
-                if (i == 1 || j == 1) {
-                    curr_row[j] = 1;
-                } else {
-                    // increment LCS length by 1 compared to the previous element
-                    curr_row[j] = prev_row[j - 1] + 1;
-                }
-
-                // update max_length if necessary
-                if (curr_row[j] > max_length) {
-                    max_length = curr_row[j];
-                }
-            } else {
-                // reset LCS length if elements don't match
-                curr_row[j] = 0;
-            }
-        }
-
-        // update the previous row for the next iteration
-        prev_row = curr_row;
-    }
-
-    // return the maximum length of the LCS
-    return max_length;
-}
-
 static bool ends_with(const std::string & str, const std::string & suffix) {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
 }
@@ -62,6 +62,9 @@ int main(int argc, char ** argv) {
         }
     }, nullptr);

+    // load dynamic backends
+    ggml_backend_load_all();
+
     // initialize the model
     llama_model_params model_params = llama_model_default_params();
     model_params.n_gpu_layers = ngl;
@@ -74,6 +74,10 @@ int main(int argc, char ** argv) {
         }
     }

+    // load dynamic backends
+
+    ggml_backend_load_all();
+
     // initialize the model

     llama_model_params model_params = llama_model_default_params();
examples/speculative-simple/CMakeLists.txt (new file)
@@ -0,0 +1,5 @@
+set(TARGET llama-speculative-simple)
+add_executable(${TARGET} speculative-simple.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
examples/speculative-simple/README.md (new file)
@@ -0,0 +1,12 @@
+# llama.cpp/examples/speculative-simple
+
+Demonstration of basic greedy speculative decoding
+
+```bash
+./bin/llama-speculative-simple \
+    -m  ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
+    -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
+    -f test.txt -c 0 -ngl 99 --color \
+    --sampling-seq k --top-k 1 -fa --temp 0.0 \
+    -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+```
|
274
examples/speculative-simple/speculative-simple.cpp
Normal file
274
examples/speculative-simple/speculative-simple.cpp
Normal file
@ -0,0 +1,274 @@
|
|||||||
|
#include "arg.h"
|
||||||
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
|
#include "speculative.h"
|
||||||
|
#include "log.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
common_params params;
|
||||||
|
|
||||||
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.n_predict < -1) {
|
||||||
|
LOG_ERR("%s: --n-predict must be >= -1\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
common_init();
|
||||||
|
|
||||||
|
if (params.speculative.model.empty()) {
|
||||||
|
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// init llama.cpp
|
||||||
|
llama_backend_init();
|
||||||
|
llama_numa_init(params.numa);
|
||||||
|
|
||||||
|
llama_model * model_tgt = NULL;
|
||||||
|
llama_model * model_dft = NULL;
|
||||||
|
|
||||||
|
llama_context * ctx_tgt = NULL;
|
||||||
|
llama_context * ctx_dft = NULL;
|
||||||
|
|
||||||
|
// load the target model
|
||||||
|
common_init_result llama_init_tgt = common_init_from_params(params);
|
||||||
|
|
||||||
|
model_tgt = llama_init_tgt.model;
|
||||||
|
ctx_tgt = llama_init_tgt.context;
|
||||||
|
|
||||||
|
// load the draft model
|
||||||
|
params.devices = params.speculative.devices;
|
||||||
|
params.model = params.speculative.model;
|
||||||
|
params.n_ctx = params.speculative.n_ctx;
|
||||||
|
params.n_batch = params.speculative.n_ctx > 0 ? params.speculative.n_ctx : params.n_batch;
|
||||||
|
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||||
|
|
||||||
|
if (params.speculative.cpuparams.n_threads > 0) {
|
||||||
|
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||||
|
common_init_result llama_init_dft = common_init_from_params(params);
|
||||||
|
|
||||||
|
model_dft = llama_init_dft.model;
|
||||||
|
ctx_dft = llama_init_dft.context;
|
||||||
|
|
||||||
|
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tokenize the prompt
|
||||||
|
std::vector<llama_token> inp;
|
||||||
|
inp = common_tokenize(ctx_tgt, params.prompt, true, true);
|
||||||
|
|
||||||
|
if (llama_n_ctx(ctx_tgt) < (int) inp.size()) {
|
||||||
|
LOG_ERR("%s: the prompt exceeds the context size (%d tokens, ctx %d)\n", __func__, (int) inp.size(), llama_n_ctx(ctx_tgt));
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (llama_n_batch(ctx_tgt) < (int) inp.size()) {
|
||||||
|
LOG_ERR("%s: the prompt exceeds the batch size (%d tokens, batch %d)\n", __func__, (int) inp.size(), llama_n_batch(ctx_tgt));
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG("\n\n");
|
||||||
|
|
||||||
|
for (auto id : inp) {
|
||||||
|
LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// how many tokens to draft each time
|
||||||
|
int n_draft = params.speculative.n_max;
|
||||||
|
int n_draft_min = params.speculative.n_min;
|
||||||
|
|
||||||
|
float p_min = params.speculative.p_min;
|
||||||
|
|
||||||
|
int n_predict = 0;
|
||||||
|
int n_drafted = 0;
|
||||||
|
int n_accept = 0;
|
||||||
|
|
||||||
|
// used to determine end of generation
|
||||||
|
bool has_eos = false;
|
||||||
|
|
||||||
|
// ================================================
|
||||||
|
// everything until here is standard initialization
|
||||||
|
// the relevant stuff for speculative decoding starts here
|
||||||
|
|
||||||
|
const auto t_enc_start = ggml_time_us();
|
||||||
|
|
||||||
|
// target model sampling context
|
||||||
|
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||||
|
|
||||||
|
// eval the prompt
|
||||||
|
llama_decode(ctx_tgt, llama_batch_get_one(inp.data(), inp.size() - 1));
|
||||||
|
|
||||||
|
// note: keep the last token separate!
|
||||||
|
llama_token id_last = inp.back();
|
||||||
|
|
||||||
|
// all tokens currently in the target context
|
||||||
|
auto prompt_tgt = std::vector<llama_token>(inp.begin(), inp.end() - 1);
|
||||||
|
|
||||||
|
int n_past = inp.size() - 1;
|
||||||
|
|
||||||
|
// init the speculator
|
||||||
|
struct common_speculative_params params_spec;
|
||||||
|
params_spec.n_draft = n_draft;
|
||||||
|
params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
|
||||||
|
params_spec.p_min = p_min;
|
||||||
|
|
||||||
|
struct common_speculative * spec = common_speculative_init(ctx_dft);
|
||||||
|
|
||||||
|
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||||
|
|
||||||
|
const auto t_enc_end = ggml_time_us();
|
||||||
|
|
||||||
|
const auto t_dec_start = ggml_time_us();
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
// optionally, generate draft tokens that can be appended to the target batch
|
||||||
|
//
|
||||||
|
// this is the most important part of the speculation. the more probable tokens that are provided here
|
||||||
|
// the better the performance will be. in theory, this computation can be performed asynchronously and even
|
||||||
|
// offloaded to a remote device. it doesn't even have to be based on an LLM. instead, it can provide tokens
|
||||||
|
// from a cache or lookup tables.
|
||||||
|
//
|
||||||
|
llama_tokens draft = common_speculative_gen_draft(spec, params_spec, prompt_tgt, id_last);
|
||||||
|
|
||||||
|
//LOG_DBG("draft: %s\n", string_from(ctx_dft, draft).c_str());
|
||||||
|
|
||||||
|
// always have a token to evaluate from before - id_last
|
||||||
|
common_batch_clear(batch_tgt);
|
||||||
|
common_batch_add (batch_tgt, id_last, n_past++, { 0 }, true);
|
||||||
|
|
||||||
|
// evaluate the target model on [id_last, draft0, draft1, ..., draftN-1]
|
||||||
|
{
|
||||||
|
// do not waste time on small drafts
|
||||||
|
if (draft.size() < n_draft_min) {
|
||||||
|
draft.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 0; i < draft.size(); ++i) {
|
||||||
|
common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
//LOG_DBG("target batch: %s\n", string_from(ctx_tgt, batch_tgt).c_str());
|
||||||
|
|
||||||
|
llama_decode(ctx_tgt, batch_tgt);
|
||||||
|
}
|
||||||
|
|
||||||
|
// sample from the full target batch and return the accepted tokens based on the target sampler
|
||||||
|
//
|
||||||
|
// for each token to be accepted, the sampler would have to sample that same token
|
||||||
|
// in such cases, instead of decoding the sampled token as we normally do, we simply continue with the
|
||||||
|
// available logits from the batch and sample the next token until we run out of logits or the sampler
|
||||||
|
// disagrees with the draft
|
||||||
|
//
|
||||||
|
const auto ids = common_sampler_sample_and_accept_n(smpl, ctx_tgt, draft);
|
||||||
|
|
||||||
|
//LOG_DBG("ids: %s\n", string_from(ctx_tgt, ids).c_str());
|
||||||
|
|
||||||
|
GGML_ASSERT(ids.size() > 0); // there will always be at least one accepted token
|
||||||
|
|
||||||
|
n_past += ids.size() - 1;
|
||||||
|
n_drafted += batch_tgt.n_tokens - 1;
|
||||||
|
n_accept += ids.size() - 1;
|
||||||
|
|
||||||
|
// process the accepted tokens and update contexts
|
||||||
|
//
|
||||||
|
// this is the standard token post-processing that we normally do
|
||||||
|
// in this case, we do it for a group of accepted tokens at once
|
||||||
|
//
|
||||||
|
{
|
||||||
|
llama_token id;
|
||||||
|
std::string token_str;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < ids.size(); ++i) {
|
||||||
|
id = ids[i];
|
||||||
|
|
||||||
|
++n_predict;
|
||||||
|
|
||||||
|
if (llama_token_is_eog(model_tgt, id)) {
|
||||||
|
has_eos = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
token_str = common_token_to_piece(ctx_tgt, id);
|
||||||
|
|
||||||
|
if (params.use_color && i + 1 < ids.size()) {
|
||||||
|
LOG("\u001b[%dm%s\u001b[37m", (36 - 0 % 6), token_str.c_str());
|
||||||
|
} else {
|
||||||
|
LOG("%s", token_str.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG_DBG("accepted %d/%d draft tokens, the last target token is: (%d, '%s')\n", (int) ids.size() - 1, (int) draft.size(), id, token_str.c_str());
|
||||||
|
|
||||||
|
{
|
||||||
|
LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
|
||||||
|
|
||||||
|
llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt_tgt.push_back(id_last);
|
||||||
|
prompt_tgt.insert(prompt_tgt.end(), ids.begin(), ids.end() - 1);
|
||||||
|
|
||||||
|
// remember the last accepted token for the next iteration
|
||||||
|
id_last = id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto t_dec_end = ggml_time_us();
|
||||||
|
|
||||||
|
const int n_input = inp.size();
|
||||||
|
|
||||||
|
LOG("\n\n");
|
||||||
|
|
||||||
|
LOG_INF("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
|
||||||
|
LOG_INF("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));
|
||||||
|
|
||||||
|
LOG_INF("\n");
|
||||||
|
LOG_INF("n_draft = %d\n", n_draft);
|
||||||
|
LOG_INF("n_predict = %d\n", n_predict);
|
||||||
|
LOG_INF("n_drafted = %d\n", n_drafted);
|
||||||
|
LOG_INF("n_accept = %d\n", n_accept);
|
||||||
|
LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
|
||||||
|
|
||||||
|
LOG_INF("\n");
|
||||||
|
LOG_INF("draft:\n\n");
|
||||||
|
|
||||||
|
llama_perf_context_print(ctx_dft);
|
||||||
|
|
||||||
|
LOG_INF("\n");
|
||||||
|
LOG_INF("target:\n\n");
|
||||||
|
common_perf_print(ctx_tgt, smpl);
|
||||||
|
|
||||||
|
common_sampler_free(smpl);
|
||||||
|
common_speculative_free(spec);
|
||||||
|
|
||||||
|
llama_free(ctx_tgt);
|
||||||
|
llama_free_model(model_tgt);
|
||||||
|
|
||||||
|
llama_free(ctx_dft);
|
||||||
|
llama_free_model(model_dft);
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
LOG("\n\n");
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -12,7 +12,7 @@
|
|||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
|
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 128
|
||||||
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
|
||||||
|
|
||||||
struct seq_draft {
|
struct seq_draft {
|
||||||
@ -33,7 +33,7 @@ int main(int argc, char ** argv) {
|
|||||||
common_params params;
|
common_params params;
|
||||||
|
|
||||||
// needed to get candidate probs even for temp <= 0.0
|
// needed to get candidate probs even for temp <= 0.0
|
||||||
params.sparams.n_probs = 128;
|
params.sampling.n_probs = 128;
|
||||||
|
|
||||||
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
|
||||||
return 1;
|
return 1;
|
||||||
@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
|
|||||||
|
|
||||||
common_init();
|
common_init();
|
||||||
|
|
||||||
if (params.model_draft.empty()) {
|
if (params.speculative.model.empty()) {
|
||||||
LOG_ERR("%s: --model-draft is required\n", __func__);
|
LOG_ERR("%s: --model-draft is required\n", __func__);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
@ -55,9 +55,9 @@ int main(int argc, char ** argv) {
|
|||||||
const int n_seq_dft = params.n_parallel;
|
const int n_seq_dft = params.n_parallel;
|
||||||
|
|
||||||
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
// probability threshold for splitting a draft branch (only for n_seq_dft > 1)
|
||||||
const float p_split = params.p_split;
|
const float p_draft_split = params.speculative.p_split;
|
||||||
|
|
||||||
std::default_random_engine rng(params.sparams.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sparams.seed);
|
std::default_random_engine rng(params.sampling.seed == LLAMA_DEFAULT_SEED ? std::random_device()() : params.sampling.seed);
|
||||||
std::uniform_real_distribution<> u_dist;
|
std::uniform_real_distribution<> u_dist;
|
||||||
|
|
||||||
// init llama.cpp
|
// init llama.cpp
|
||||||
@ -76,13 +76,14 @@ int main(int argc, char ** argv) {
|
|||||||
ctx_tgt = llama_init_tgt.context;
|
ctx_tgt = llama_init_tgt.context;
|
||||||
|
|
||||||
// load the draft model
|
// load the draft model
|
||||||
params.model = params.model_draft;
|
params.devices = params.speculative.devices;
|
||||||
params.n_gpu_layers = params.n_gpu_layers_draft;
|
params.model = params.speculative.model;
|
||||||
if (params.draft_cpuparams.n_threads > 0) {
|
params.n_gpu_layers = params.speculative.n_gpu_layers;
|
||||||
params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
|
if (params.speculative.cpuparams.n_threads > 0) {
|
||||||
|
params.cpuparams.n_threads = params.speculative.cpuparams.n_threads;
|
||||||
}
|
}
|
||||||
|
|
||||||
params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
|
params.cpuparams_batch.n_threads = params.speculative.cpuparams_batch.n_threads;
|
||||||
common_init_result llama_init_dft = common_init_from_params(params);
|
common_init_result llama_init_dft = common_init_from_params(params);
|
||||||
model_dft = llama_init_dft.model;
|
model_dft = llama_init_dft.model;
|
||||||
ctx_dft = llama_init_dft.context;
|
ctx_dft = llama_init_dft.context;
|
||||||
@ -170,7 +171,7 @@ int main(int argc, char ** argv) {
|
|||||||
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
//GGML_ASSERT(n_vocab == llama_n_vocab(model_dft));
|
||||||
|
|
||||||
// how many tokens to draft each time
|
// how many tokens to draft each time
|
||||||
int n_draft = params.n_draft;
|
int n_draft = params.speculative.n_max;
|
||||||
|
|
||||||
int n_predict = 0;
|
int n_predict = 0;
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
@ -183,14 +184,14 @@ int main(int argc, char ** argv) {
|
|||||||
bool has_eos = false;
|
bool has_eos = false;
|
||||||
|
|
||||||
// target model sampling context (reuse the llama_context's sampling instance)
|
// target model sampling context (reuse the llama_context's sampling instance)
|
||||||
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);
|
struct common_sampler * smpl = common_sampler_init(model_tgt, params.sampling);
|
||||||
|
|
||||||
// draft sequence data
|
// draft sequence data
|
||||||
std::vector<seq_draft> drafts(n_seq_dft);
|
std::vector<seq_draft> drafts(n_seq_dft);
|
||||||
|
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
// allocate llama_sampler for each draft sequence
|
// allocate llama_sampler for each draft sequence
|
||||||
drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
|
drafts[s].smpl = common_sampler_init(model_dft, params.sampling);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
llama_batch batch_dft = llama_batch_init(llama_n_batch(ctx_dft), 0, 1);
|
||||||
@ -230,7 +231,7 @@ int main(int argc, char ** argv) {
|
|||||||
// for stochastic sampling, attempt to match the token with the drafted tokens
|
// for stochastic sampling, attempt to match the token with the drafted tokens
|
||||||
{
|
{
|
||||||
bool accept = false;
|
bool accept = false;
|
||||||
if (params.sparams.temp > 0) {
|
if (params.sampling.temp > 0) {
|
||||||
// stochastic verification
|
// stochastic verification
|
||||||
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
|
||||||
|
|
||||||
@ -267,11 +268,12 @@ int main(int argc, char ** argv) {
|
|||||||
for (size_t i = 0; i < dist_tgt.size; i++) {
|
for (size_t i = 0; i < dist_tgt.size; i++) {
|
||||||
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
if (dist_tgt.data[i].id == drafts[s].tokens[i_dft]) {
|
||||||
p_tgt = dist_tgt.data[i].p;
|
p_tgt = dist_tgt.data[i].p;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
for (size_t i = 0; i < dist_dft.size; i++) {
|
||||||
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
|
if (dist_dft.data[i].id == drafts[s].tokens[i_dft]) {
|
||||||
p_dft = dist_dft.data[i].p;
|
p_dft = dist_dft.data[i].p;
|
||||||
}
|
|
||||||
if (p_tgt && p_dft) {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -493,7 +495,7 @@ int main(int argc, char ** argv) {

     // attempt to split the branch if the probability is high enough
     for (int f = 1; f < 8; ++f) {
-        if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_split) {
+        if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
             LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);

             llama_kv_cache_seq_rm(ctx_dft, n_seq_cur, -1, -1);
flake.lock (generated, 6 changes)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1730200266,
+        "lastModified": 1732014248,
-        "narHash": "sha256-l253w0XMT8nWHGXuXqyiIC/bMvh1VRszGXgdpQlfhvU=",
+        "narHash": "sha256-y/MEyuJ5oBWrWAic/14LaIr/u5E0wRVzyYsouYY3W6w=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "807e9154dcb16384b1b765ebe9cd2bba2ac287fd",
+        "rev": "23e89b7da85c3640bbc2173fe04f4bd114342367",
         "type": "github"
       },
       "original": {
@@ -33,6 +33,7 @@ else()
 endif()

 option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)

 #
 # option list

@@ -92,6 +93,7 @@ else()
 endif()

 option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
+option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)

 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

@@ -108,6 +110,7 @@ if (NOT MSVC)
 endif()
 option(GGML_LASX "ggml: enable lasx" ON)
 option(GGML_LSX "ggml: enable lsx" ON)
+option(GGML_RVV "ggml: enable rvv" ON)
 option(GGML_SVE "ggml: enable SVE" OFF)

 if (WIN32)

@@ -116,6 +119,7 @@ endif()

 # ggml core
 set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(GGML_CPU "ggml: enable CPU backend" ON)

 # 3rd party libs / backends
 option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)

@@ -126,14 +130,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"

 option(GGML_CUDA "ggml: use CUDA" OFF)
 option(GGML_MUSA "ggml: use MUSA" OFF)
-option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
-set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
-set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-     "ggml: iters./thread per block for Q2_K/Q6_K")
 set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
      "ggml: max. batch size for using peer access")
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)

@@ -141,7 +140,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
 option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})

-option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
+option(GGML_HIP "ggml: use HIP" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)

@@ -153,6 +152,7 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_KOMPUTE "ggml: use Kompute" OFF)
 option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
+option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
 option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
 option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
 option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})

@@ -166,6 +166,8 @@ option(GGML_SYCL "ggml: use SYCL"
 option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
 set (GGML_SYCL_TARGET "INTEL" CACHE STRING
      "ggml: sycl target device")
+set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
+     "ggml: sycl device architecture")

 # extra artifacts
 option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})

@@ -225,6 +227,7 @@ set(GGML_PUBLIC_HEADERS
     include/ggml-cann.h
     include/ggml-cuda.h
     include/ggml-kompute.h
+    include/ggml-opt.h
     include/ggml-metal.h
     include/ggml-rpc.h
     include/ggml-sycl.h

@@ -234,15 +237,14 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
 # set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
 #endif()
-install(TARGETS ggml PUBLIC_HEADER)
+install(TARGETS ggml LIBRARY PUBLIC_HEADER)
+install(TARGETS ggml-base LIBRARY)
-if (BUILD_SHARED_LIBS)
-    install(TARGETS ggml LIBRARY)
-endif()

+# FIXME: this should be done in the backend cmake files
 if (GGML_METAL)
+    # FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
     install(
-        FILES src/ggml-metal.metal
+        FILES src/ggml-metal/ggml-metal.metal
         PERMISSIONS
             OWNER_READ
             OWNER_WRITE
@@ -9,16 +9,16 @@ extern "C" {
 #endif

 // buffer_type API
-GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);

-GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);

 // backend API
-GGML_API ggml_backend_t ggml_backend_amx_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);

-GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
+GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);

-GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);

 #ifdef __cplusplus
 }
@@ -3,6 +3,20 @@
 #include "ggml.h"
 #include "ggml-alloc.h"

+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif

 #ifdef __cplusplus
 extern "C" {
 #endif
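A hedged illustration of what the macro block above is meant to do for a backend built as a shared library; where GGML_BACKEND_SHARED and GGML_BACKEND_BUILD actually get defined is a build-system detail that is not shown in this commit:

    /* compiled as part of a backend shared library (assumption for this sketch) */
    #define GGML_BACKEND_SHARED
    #define GGML_BACKEND_BUILD
    #include "ggml-backend.h"

    /* GGML_BACKEND_API now expands to an export declaration (dllexport on MSVC,
       default visibility elsewhere); a consumer that only defines
       GGML_BACKEND_SHARED gets the import/extern form instead, and a static
       build falls back to plain extern. */
    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); /* example exported entry point */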
@@ -72,7 +86,7 @@ extern "C" {
 GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
 GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

-// "offset" refers to the offset of the tensor data for setting/getting data
+// "offset" refers to the offset in tensor->data for setting/getting data
 GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
 GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
 GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

@@ -176,6 +190,14 @@ extern "C" {
 typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
 // Get additional buffer types provided by the device (returns a NULL-terminated array)
 typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+// Set the abort callback for the backend
+typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+struct ggml_backend_feature {
+    const char * name;
+    const char * value;
+};
+typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);

 //
 // Backend registry

@@ -200,6 +222,13 @@ extern "C" {
 // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
 GGML_API ggml_backend_t ggml_backend_init_best(void);

+// Load a backend from a dynamic library and register it
+GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+// Unload a backend if loaded dynamically and unregister it
+GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
+// Load all known backends from dynamic libraries
+GGML_API void ggml_backend_load_all(void);

 //
 // Backend scheduler
 //
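The loading functions added above pair naturally with ggml_backend_init_best() from the previous hunk. A minimal sketch, assuming the backends were built as dynamic libraries (GGML_BACKEND_DL) and can be discovered by ggml_backend_load_all(); ggml_backend_name() and ggml_backend_free() are pre-existing API from the same header, not part of this diff:

    #include "ggml-backend.h"
    #include <stdio.h>

    int main(void) {
        // register every backend that can be found as a dynamic library
        ggml_backend_load_all();

        // pick the highest-priority usable device (GPU if one was loaded, CPU otherwise)
        ggml_backend_t backend = ggml_backend_init_best();
        if (backend == NULL) {
            fprintf(stderr, "no usable backend found\n");
            return 1;
        }

        printf("using backend: %s\n", ggml_backend_name(backend));
        ggml_backend_free(backend);
        return 0;
    }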
@@ -228,14 +257,20 @@ extern "C" {
     ggml_backend_sched_reserve(sched, reserve_graph);

     // compute
-    graph = build_graph(sched);
+    graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
-    ggml_backend_sched_graph_compute(sched, graph);
+    for (int i = 0; i < 10; ++i) {
+        ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+    }

     // if there are graph inputs:
-    ggml_backend_sched_reset(sched);
+    graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
-    ggml_backend_sched_alloc_graph(sched, graph);
+    ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
-    ggml_backend_tensor_set(input_tensor, ...);
+    ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
-    ggml_backend_sched_graph_compute(sched, graph);
+    ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+    ggml_backend_sched_graph_compute(sched, graph); // execute the graph

+    // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+    // allocate them statically via ggml_backend_alloc_ctx_tensors
 }
 */

@@ -250,7 +285,7 @@ extern "C" {
 //
 typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

-// Initialize a backend scheduler
+// Initialize a backend scheduler, backends with low index are given priority over backends with high index
 GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
 GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

@@ -275,7 +310,9 @@ extern "C" {
 GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

-// Reset all assignments and allocators - must be called before changing the node backends
+// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+// The correct way to use this API is to discard the deallocated tensors and create new ones.
 GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

 // Set a callback to be called for each resulting node during graph compute
@@ -9,15 +9,15 @@ extern "C" {
 #endif

 // backend API
-GGML_API ggml_backend_t ggml_backend_blas_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);

-GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);

 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

-GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);


 #ifdef __cplusplus
@@ -34,7 +34,7 @@ extern "C" {
 */
 #define GGML_CANN_MAX_DEVICES 16

-GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);

 /**
 * @brief Initializes the CANN backend for a specified device.

@@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
 * @param device The index of the device to initialize.
 * @return A pointer to the initialized backend instance, or nullptr on failure.
 */
-GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);

 /**
 * @brief Checks if a given backend is a CANN backend.

@@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
 * @param backend The backend instance to check.
 * @return True if the backend is a CANN backend, false otherwise.
 */
-GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);

 /**
 * @brief Retrieves the CANN buffer type for a specified device.

@@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
 * @return A pointer to the buffer type interface for the specified device, or
 * nullptr if the device index is out of range.
 */
-GGML_API ggml_backend_buffer_type_t
+GGML_BACKEND_API ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device);

 /**

@@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
 *
 * @return The number of CANN devices available.
 */
-GGML_API int32_t ggml_backend_cann_get_device_count(void);
+GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);

 /**
 * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
 *
 * @return A pointer to the host buffer type interface.
 */
-GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);

 /**
 * @brief Retrieves the description of a specific CANN device.

@@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 * @param description Pointer to a buffer where the description will be written.
 * @param description_size Size of the description buffer.
 */
-GGML_API void ggml_backend_cann_get_device_description(
+GGML_BACKEND_API void ggml_backend_cann_get_device_description(
     int32_t device, char* description, size_t description_size);

 /**

@@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
 * @param total Pointer to a variable where the total memory size will be
 * stored.
 */
-GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
     size_t* free,
     size_t* total);
@@ -7,29 +7,6 @@
 extern "C" {
 #endif

-// Scheduling priorities
-enum ggml_sched_priority {
-    GGML_SCHED_PRIO_NORMAL,
-    GGML_SCHED_PRIO_MEDIUM,
-    GGML_SCHED_PRIO_HIGH,
-    GGML_SCHED_PRIO_REALTIME
-};

-// Threadpool params
-// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
-struct ggml_threadpool_params {
-    bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
-    int n_threads; // number of threads
-    enum ggml_sched_priority prio; // thread priority
-    uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
-    bool strict_cpu; // strict cpu placement
-    bool paused; // start in paused state
-};

-struct ggml_threadpool; // forward declaration, see ggml.c

-typedef struct ggml_threadpool * ggml_threadpool_t;

 // the compute plan that needs to be prepared for ggml_graph_compute()
 // since https://github.com/ggerganov/ggml/issues/287
 struct ggml_cplan {

@@ -54,54 +31,74 @@ extern "C" {
     GGML_NUMA_STRATEGY_COUNT
 };

-GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
-GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

-GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

-GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

-GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

-GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);

-GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

-GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);

-GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
-GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
+GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
-GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
-GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
+GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
-GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
-GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
-GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);

 // ggml_graph_plan() has to be called before ggml_graph_compute()
 // when plan.work_size > 0, caller must allocate memory for plan.work_data
-GGML_API struct ggml_cplan ggml_graph_plan(
+GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
     const struct ggml_cgraph * cgraph,
     int n_threads, /* = GGML_DEFAULT_N_THREADS */
     struct ggml_threadpool * threadpool /* = NULL */ );
-GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

 // same as ggml_graph_compute() but the work data is allocated as a part of the context
 // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

-// TODO: move to backend interface
+//
-GGML_API int ggml_cpu_has_neon (void);
+// system info
-GGML_API int ggml_cpu_has_sve (void);
+//
-GGML_API int ggml_cpu_has_matmul_int8(void);
-// get the sve vector length in bytes
+// x86
-GGML_API int ggml_cpu_get_sve_cnt(void);
+GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
+GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
+GGML_BACKEND_API int ggml_cpu_has_avx (void);
+GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
+GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
+GGML_BACKEND_API int ggml_cpu_has_f16c (void);
+GGML_BACKEND_API int ggml_cpu_has_fma (void);
+GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
+GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
+GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
+GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
+GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
+// ARM
+GGML_BACKEND_API int ggml_cpu_has_neon (void);
+GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
+GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
+GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
+GGML_BACKEND_API int ggml_cpu_has_sve (void);
+GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
+// other
+GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
+GGML_BACKEND_API int ggml_cpu_has_vsx (void);
+GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
+GGML_BACKEND_API int ggml_cpu_has_llamafile (void);

 // Internal types and functions exposed for tests and benchmarks

@@ -115,6 +112,7 @@ extern "C" {
     const void * GGML_RESTRICT y, int nr, int nc);

 struct ggml_type_traits_cpu {
+    ggml_from_float_t from_float;
     ggml_from_float_to_mat_t from_float_to_mat;
     ggml_vec_dot_t vec_dot;
     enum ggml_type vec_dot_type;

@@ -124,27 +122,30 @@ extern "C" {
     ggml_gemm_t gemm;
 };

-GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);

-GGML_API void ggml_cpu_init(void);
+GGML_BACKEND_API void ggml_cpu_init(void);

 //
 // CPU backend
 //

-GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);

-GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
-GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
-GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

-GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

 #ifdef GGML_USE_CPU_HBM
-GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif

+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
+GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);

 #ifdef __cplusplus
 }
 #endif
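A short sketch of how the relocated CPU-backend API above is typically driven. This is an illustration, not code from the commit: ggml_threadpool_params_default() is one of the declarations removed from this header above, and the sketch assumes it remains available from ggml.h after the move; everything else it calls is declared in the hunks shown here or in ggml-backend.h:

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-cpu.h"
    #include <stdio.h>

    int main(void) {
        ggml_backend_t cpu = ggml_backend_cpu_init();

        // create a threadpool and attach it to the CPU backend
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
        struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);
        ggml_backend_cpu_set_threadpool(cpu, tp);

        // the ggml_cpu_has_* family reports which SIMD paths this build can use
        printf("AVX2: %d  NEON: %d  SVE bytes: %d\n",
               ggml_cpu_has_avx2(), ggml_cpu_has_neon(), ggml_cpu_get_sve_cnt());

        ggml_backend_free(cpu);
        ggml_threadpool_free(tp);
        return 0;
    }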
@@ -7,7 +7,7 @@
 extern "C" {
 #endif

-#ifdef GGML_USE_HIPBLAS
+#ifdef GGML_USE_HIP
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
 #elif defined(GGML_USE_MUSA)

@@ -20,27 +20,27 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES 16

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);

-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);

 // device buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

-GGML_API int ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

-GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

-GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);

 #ifdef __cplusplus
 }
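The renamed declarations above are an export-macro change only; usage of the CUDA backend header is unchanged. A small hedged sketch against this header, assuming a CUDA-enabled build with at least one visible device (ggml_backend_free() comes from ggml-backend.h, not from this hunk):

    #include "ggml-backend.h"
    #include "ggml-cuda.h"
    #include <stdio.h>

    int main(void) {
        int n_dev = ggml_backend_cuda_get_device_count();
        for (int i = 0; i < n_dev; ++i) {
            char desc[128];
            size_t free_mem, total_mem;
            ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
            ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
            printf("device %d: %s, %zu/%zu bytes free\n", i, desc, free_mem, total_mem);
        }

        ggml_backend_t backend = ggml_backend_cuda_init(0); // device 0
        // ... build and compute graphs with this backend ...
        ggml_backend_free(backend);
        return 0;
    }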
@@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
 // forward declaration
 typedef struct ggml_backend * ggml_backend_t;

-GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);

-GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);

-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);

-GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);

 #ifdef __cplusplus
 }
@@ -39,27 +39,27 @@ extern "C" {
 // user-code should use only these functions
 //

-GGML_API ggml_backend_t ggml_backend_metal_init(void);
+GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);

-GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_DEPRECATED(
-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
 "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");

-GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

 // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);

-GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);

 #ifdef __cplusplus
 }
ggml/include/ggml-opt.h (new file, 216 lines)

@@ -0,0 +1,216 @@
// This file contains functionality for training models using GGML.
// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
// At the bottom of this file especially there are relatively high-level functions that are suitable use or adaptation in user code.
//
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_opt_dataset;
struct ggml_opt_context;
struct ggml_opt_result;

typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
typedef struct ggml_opt_context * ggml_opt_context_t;
typedef struct ggml_opt_result * ggml_opt_result_t;

// ====== Loss ======

// built-in loss types, i.e. the built-in quantities minimized by the optimizer
// custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
enum ggml_opt_loss_type {
    GGML_OPT_LOSS_TYPE_MEAN,
    GGML_OPT_LOSS_TYPE_SUM,
    GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
    GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
};

// ====== Dataset ======

GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
    int64_t ne_datapoint, // number of elements per datapoint
    int64_t ne_label, // number of elements per label
    int64_t ndata, // total number of datapoints/labels
    int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);

// get underlying tensors that store the data
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]

// shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);

// get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
GGML_API void ggml_opt_dataset_get_batch(
    ggml_opt_dataset_t dataset,
    struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
    struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
    int64_t ibatch);

// ====== Model / Context ======

enum ggml_opt_build_type {
    GGML_OPT_BUILD_TYPE_FORWARD,
    GGML_OPT_BUILD_TYPE_GRAD,
    GGML_OPT_BUILD_TYPE_OPT,
};

// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
struct ggml_opt_optimizer_params {
    // AdamW optimizer parameters
    struct {
        float alpha; // learning rate
        float beta1;
        float beta2;
        float eps; // epsilon for numerical stability
        float wd; // weight decay for AdamW, use 0.0f to disable
    } adamw;
};

// callback to calculate optimizer parameters prior to a backward pass
// userdata can be used to pass arbitrary data
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);

// returns the default optimizer params (constant)
// userdata is not used
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);

// parameters for initializing a new optimization context
struct ggml_opt_params {
    ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs

    struct ggml_context * ctx_compute; // created in user code, holds non-static tensors

    // the forward graph is defined by inputs and outputs
    // those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
    struct ggml_tensor * inputs;
    struct ggml_tensor * outputs;

    enum ggml_opt_loss_type loss_type;
    enum ggml_opt_build_type build_type;

    int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done

    ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
    void * get_opt_pars_ud; // userdata for calculating optimizer parameters
};

// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
    ggml_backend_sched_t backend_sched,
    struct ggml_context * ctx_compute,
    struct ggml_tensor * inputs,
    struct ggml_tensor * outputs,
    enum ggml_opt_loss_type loss_type);

GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);

// set gradients to zero, initilize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);

// get underlying tensors that store data
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels

GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);

// ====== Optimization Result ======

GGML_API ggml_opt_result_t ggml_opt_result_init();
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);

// get data from result, uncertainties are optional and can be ignored by passing NULL
GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints
GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value
GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values
GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value

// ====== Computation ======

// do forward pass, increment result if not NULL
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

// do forward pass, increment result if not NULL, do backward pass
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);

// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
// ## functions or structs and can be copied to and adapted for user code. ##
// ############################################################################

// ====== Intended Usage ======
//
// 1. Select the appropriate loss for your problem.
// 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
//    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
// 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
//    The first context should contain the model parameters and inputs and be allocated statically in user code.
//    The second context should contain all other tensors and will be (re)allocated automatically.
//    Due to this automated allocation the data of the second context is not defined when accessed in user code.
//    Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
// 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.

// signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
typedef void (*ggml_opt_epoch_callback)(
    bool train, // true after training evaluation, false after validation evaluation
    ggml_opt_context_t opt_ctx,
    ggml_opt_dataset_t dataset,
    ggml_opt_result_t result, // result associated with the dataset subsection
    int64_t ibatch, // number of batches that have been evaluated so far
    int64_t ibatch_max, // total number of batches in this dataset subsection
    int64_t t_start_us); // time at which the evaluation on the dataset subsection was started

// do training on front of dataset, do evaluation only on back of dataset
GGML_API void ggml_opt_epoch(
    ggml_opt_context_t opt_ctx,
    ggml_opt_dataset_t dataset,
    ggml_opt_result_t result_train, // result to increment during training, ignored if NULL
    ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL
    int64_t idata_split, // data index at which to split training and evaluation
    ggml_opt_epoch_callback callback_train,
    ggml_opt_epoch_callback callback_eval);

// callback that prints a progress bar on stderr
GGML_API void ggml_opt_epoch_callback_progress_bar(
    bool train,
    ggml_opt_context_t opt_ctx,
    ggml_opt_dataset_t dataset,
    ggml_opt_result_t result,
    int64_t ibatch,
    int64_t ibatch_max,
    int64_t t_start_us);

// fit model defined by inputs and outputs to dataset
GGML_API void ggml_opt_fit(
    ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
    ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
    ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
    ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
    ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
    enum ggml_opt_loss_type loss_type, // loss to minimize
    ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
    int64_t nepoch, // how many times the dataset should be iterated over
    int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
    float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
    bool silent); // whether or not info prints to stderr should be suppressed

#ifdef __cplusplus
}
#endif
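Since the new header only spells out the intended usage in prose, here is a hedged end-to-end sketch of the ggml_opt_fit path for a single linear layer. The ggml_opt_* calls and shapes follow the declarations above; the function name, the surrounding setup (scheduler, dataset, parameter tensors w/b), and the ggml_* graph calls outside this header are illustrative assumptions, not code from this commit:

    #include "ggml.h"
    #include "ggml-backend.h"
    #include "ggml-opt.h"

    // fit y = w*x + b on a dataset with 1 input feature and 1 label per datapoint
    static void fit_linear(ggml_backend_sched_t sched, ggml_opt_dataset_t dataset,
                           struct ggml_context * ctx_static, struct ggml_context * ctx_compute,
                           struct ggml_tensor * w, struct ggml_tensor * b) {
        const int64_t ndata_batch = 32;

        // inputs/outputs: the second dimension is the number of datapoints per batch
        struct ggml_tensor * x = ggml_new_tensor_2d(ctx_static, GGML_TYPE_F32, 1, ndata_batch);
        ggml_set_input(x);
        struct ggml_tensor * y = ggml_add(ctx_compute, ggml_mul_mat(ctx_compute, w, x), b);
        ggml_set_output(y);

        ggml_opt_fit(sched, ctx_compute, x, y, dataset,
                     GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
                     ggml_opt_get_default_optimizer_params,
                     /*nepoch         =*/ 10,
                     /*nbatch_logical =*/ ndata_batch,
                     /*val_split      =*/ 0.1f,
                     /*silent         =*/ false);
    }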
@@ -10,18 +10,18 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS 16

 // backend API
-GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);

-GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

-GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

-GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);

-GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

-GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);

 #ifdef __cplusplus
 }
@@ -17,32 +17,32 @@ extern "C" {
 #endif

 // backend API
-GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);

-GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);

 // devide buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

-GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API void ggml_backend_sycl_get_device_description(int device,
+GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
     char *description,
     size_t description_size);
-GGML_API int ggml_backend_sycl_get_device_count();
+GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
-GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);

 // SYCL doesn't support registering host memory, keep here for reference
-// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);

-GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);

 #ifdef __cplusplus
 }
@@ -10,21 +10,21 @@ extern "C" {
 #define GGML_VK_NAME "Vulkan"
 #define GGML_VK_MAX_DEVICES 16

-GGML_API void ggml_vk_instance_init(void);
+GGML_BACKEND_API void ggml_vk_instance_init(void);

 // backend API
-GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

-GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API int ggml_backend_vk_get_device_count(void);
+GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
-GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

-GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

-GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);

 #ifdef __cplusplus
 }
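(Illustration, not part of the patch: enumerating Vulkan devices with the declarations above before initializing one; the header name and buffer size are assumptions.)

    #include <stdio.h>
    #include "ggml-vulkan.h" // assumed header for these declarations

    // list Vulkan devices and initialize the first one, if any
    static ggml_backend_t init_first_vk_device(void) {
        int n_dev = ggml_backend_vk_get_device_count();
        for (int i = 0; i < n_dev; i++) {
            char   desc[256];
            size_t free_mem = 0, total_mem = 0;
            ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
            ggml_backend_vk_get_device_memory(i, &free_mem, &total_mem);
            printf("vk device %d: %s (%zu / %zu bytes free)\n", i, desc, free_mem, total_mem);
        }
        return n_dev > 0 ? ggml_backend_vk_init(0) : NULL;
    }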
@@ -176,15 +176,15 @@
 #ifdef GGML_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
+#            define GGML_API __declspec(dllexport) extern
 #        else
-#            define GGML_API __declspec(dllimport)
+#            define GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define GGML_API __attribute__ ((visibility ("default")))
+#        define GGML_API __attribute__ ((visibility ("default"))) extern
 #    endif
 #else
-#    define GGML_API
+#    define GGML_API extern
 #endif

 // TODO: support for clang
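(Note, illustration only: with this change every GGML_API declaration is explicitly extern. On a non-Windows shared build, for example, a declaration such as the first line below now preprocesses to the second.)

    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
    // becomes, after macro expansion:
    __attribute__ ((visibility ("default"))) extern void ggml_log_set(ggml_log_callback log_callback, void * user_data);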
@@ -602,7 +602,6 @@ extern "C" {
         int32_t flags;

-        struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];

         // source tensor and offset for views
@@ -615,7 +614,7 @@ extern "C" {
         void * extra; // extra things e.g. for ggml-cuda.cu

-        // char padding[4];
+        char padding[8];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -1490,7 +1489,7 @@ extern "C" {
         "use ggml_rope_ext_inplace instead");

     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_API void ggml_rope_yarn_corr_dims(
         int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);

     // rotary position embedding backward, i.e compute dx from dy
@@ -1746,6 +1745,9 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_prec       prec);

+    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+            const struct ggml_tensor * a);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,
@@ -1982,28 +1984,20 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
             struct ggml_tensor  * grad,
-            float                 alpha,
-            float                 beta1,
-            float                 beta2,
-            float                 eps,
-            float                 wd); // weight decay
+            struct ggml_tensor  * m,
+            struct ggml_tensor  * v,
+            struct ggml_tensor  * adamw_params); // parameters such a the learning rate

     //
     // automatic differentiation
     //

-    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
-    GGML_API void ggml_build_opt_adamw(
-            struct ggml_context * ctx,
-            struct ggml_cgraph  * gf,
-            struct ggml_cgraph  * gb,
-            float                 alpha,
-            float                 beta1,
-            float                 beta2,
-            float                 eps,
-            float                 wd); // weight decay
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_backward_expand(
+        struct ggml_context * ctx_static,  // context for static gradients (loss + gradient accumulation)
+        struct ggml_context * ctx_compute, // context for gradient computation
+        struct ggml_cgraph  * cgraph,
+        bool                  accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static

     // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
@@ -2023,7 +2017,9 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor  (const struct ggml_cgraph * cgraph, const char * name);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad    (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
+    GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);

     GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
     GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
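(Illustration, not part of the patch: a rough sketch of the reworked gradient flow, building the backward graph and then reading a gradient through the new accessor. The contexts, the loss and weights tensors, and the ggml_new_graph_custom helper are assumed to exist outside this hunk.)

    // build a graph with gradients, expand forward + backward, then query a gradient
    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true);
    ggml_build_forward_expand(gf, loss);
    ggml_build_backward_expand(ctx_static, ctx_compute, gf, /*accumulate =*/ false);
    struct ggml_tensor * grad_w = ggml_graph_get_grad(gf, weights);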
@@ -2034,198 +2030,15 @@ extern "C" {
     // dump the graph into a file using the dot format
     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
-    // gb_tmp will contain original backward graph with rewritten backward process nodes,
-    // but without the second forward pass nodes.
-    GGML_API void ggml_build_backward_gradient_checkpointing(
-            struct ggml_context   * ctx,
-            struct ggml_cgraph    * gf,
-            struct ggml_cgraph    * gb,
-            struct ggml_cgraph    * gb_tmp,
-            struct ggml_tensor  * * checkpoints,
-            int                     n_checkpoints);
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_TYPE_ADAM,
-        GGML_OPT_TYPE_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_RESULT_OK = 0,
-        GGML_OPT_RESULT_DID_NOT_CONVERGE,
-        GGML_OPT_RESULT_NO_CONTEXT,
-        GGML_OPT_RESULT_INVALID_WOLFE,
-        GGML_OPT_RESULT_FAIL,
-        GGML_OPT_RESULT_CANCEL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);

     // Set callback for all future logging events.
     // If this is not called, or NULL is supplied, everything is output on stderr.
     GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
-
-        size_t graph_size;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int   past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        int n_gradient_accumulation;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float sched;          // schedule multiplier (fixed, decay or warmup)
-            float decay;          // weight decay for AdamW, use 0.0f to disable
-            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
-            float alpha;          // learning rate
-            float beta1;
-            float beta2;
-            float eps;            // epsilon for numerical stability
-            float eps_f;          // epsilon for convergence test
-            float eps_g;          // epsilon for convergence test
-            float gclip;          // gradient clipping
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    struct ggml_opt_context {
-        struct ggml_context * ctx;
-        struct ggml_opt_params params;
-
-        int     iter;
-        int64_t nx; // number of parameter elements
-
-        bool just_initialized;
-
-        float loss_before;
-        float loss_after;
-
-        struct {
-            struct ggml_tensor * g;  // current gradient
-            struct ggml_tensor * m;  // first moment
-            struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * pf; // past function values
-            float fx_best;
-            float fx_prev;
-            int   n_no_improvement;
-        } adam;
-
-        struct {
-            struct ggml_tensor * x;    // current parameters
-            struct ggml_tensor * xp;   // previous parameters
-            struct ggml_tensor * g;    // current gradient
-            struct ggml_tensor * gp;   // previous gradient
-            struct ggml_tensor * d;    // search direction
-            struct ggml_tensor * pf;   // past function values
-            struct ggml_tensor * lmal; // the L-BFGS memory alpha
-            struct ggml_tensor * lmys; // the L-BFGS memory ys
-            struct ggml_tensor * lms;  // the L-BFGS memory s
-            struct ggml_tensor * lmy;  // the L-BFGS memory y
-            float fx_best;
-            float step;
-            int   j;
-            int   k;
-            int   end;
-            int   n_no_improvement;
-        } lbfgs;
-    };
-
     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);

-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    // initialize optimizer context
-    GGML_API void ggml_opt_init(
-            struct ggml_context     * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_opt_params    params,
-            int64_t                   nx);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume_g(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f,
-            struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb,
-            ggml_opt_callback callback,
-            void * callback_data);
-
     //
     // quantization
     //
@@ -2381,38 +2194,6 @@ extern "C" {
     GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
     GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);

-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx_vnni   (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_avx512_bf16(void);
-    GGML_API int ggml_cpu_has_amx_int8   (void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_metal      (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cuda       (void);
-    GGML_API int ggml_cpu_has_vulkan     (void);
-    GGML_API int ggml_cpu_has_kompute    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_riscv_v    (void);
-    GGML_API int ggml_cpu_has_sycl       (void);
-    GGML_API int ggml_cpu_has_rpc        (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-    GGML_API int ggml_cpu_has_cann       (void);
-    GGML_API int ggml_cpu_has_llamafile  (void);
-
 #ifdef __cplusplus
 // restrict not standard in C++
 #define GGML_RESTRICT
@@ -2429,12 +2210,42 @@ extern "C" {
         size_t            type_size;
         bool              is_quantized;
         ggml_to_float_t   to_float;
-        ggml_from_float_t from_float;
        ggml_from_float_t from_float_ref;
     };

     GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

+    // ggml threadpool
+    // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+    // the goal should be to create an API that other backends can use move everything to the ggml base
+
+    // scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                     cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                      n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                        // thread priority
+        uint32_t                 poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                     strict_cpu;                  // strict cpu placement
+        bool                     paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool; // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);

 #ifdef __cplusplus
 }
 #endif
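(Illustration, not part of the patch: filling in the new threadpool parameters; the actual threadpool creation call lives in the CPU backend and is not shown in this hunk.)

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8); // 8 threads
    tpp.prio       = GGML_SCHED_PRIO_HIGH; // raise scheduling priority
    tpp.poll       = 50;                   // moderate polling instead of pure blocking
    tpp.strict_cpu = false;                // let the OS place the threads
    // tpp is then handed to the CPU backend's threadpool creation function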
File diff suppressed because it is too large
@@ -1,9 +1,5 @@
-// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
 #pragma once

-#define GGML_COMMON_DECL_C
-#include "ggml-common.h"
-
 #include "ggml.h"

 // GGML internal header
@@ -12,27 +8,11 @@
 extern "C" {
 #endif

-// Quantization
-void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
-
-void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
-
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

-// GEMV
-void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
-// GEMM
-void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
-
 #ifdef __cplusplus
 }
 #endif
@@ -466,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     return ggml_gallocr_hash_get(galloc, t)->allocated;
 }

-static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
-    struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
-    hn->buffer_id = buffer_id;
-    hn->offset = offset;
-    hn->allocated = true;
-}
-
 static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
     return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
 }

 static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0);
     struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);

     if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@@ -816,7 +810,11 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }

 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    size_t node_size = 0;
+    if (!node->data && !node->view_src) {
+        GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
+        node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    }
     return talloc->size_max >= node_size;
 }
105
ggml/src/ggml-amx/CMakeLists.txt
Normal file
@@ -0,0 +1,105 @@
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
    (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
         CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
    CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
    message(STATUS "Using AMX")

    file(GLOB   GGML_HEADERS_AMX "*.h")
    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")

    file(GLOB   GGML_SOURCES_AMX "*.cpp")

    ggml_add_backend_library(ggml-amx
                             ${GGML_HEADERS_AMX}
                             ${GGML_SOURCES_AMX}
                            )

    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
    # TODO: integrate AMX backend into the CPU backend
    if (MSVC)
        # instruction set detection for MSVC only
        if (GGML_NATIVE)
            # TODO: improve, should not reference files from the parent folder
            include(../ggml-cpu/cmake/FindSIMD.cmake)
        endif ()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)
            # MSVC has no compile-time flags enabling specific
            # AVX512 extensions, neither it defines the
            # macros corresponding to the extensions.
            # Do it manually.
            if (GGML_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (GGML_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (GGML_AVX512_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
            if (GGML_AMX_TILE)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
            endif()
            if (GGML_AMX_INT8)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
            endif()
            if (GGML_AMX_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (GGML_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (GGML_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (GGML_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (GGML_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (GGML_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
        if (GGML_AMX_TILE)
            list(APPEND ARCH_FLAGS -mamx-tile)
        endif()
        if (GGML_AMX_INT8)
            list(APPEND ARCH_FLAGS -mamx-int8)
        endif()
        if (GGML_AMX_BF16)
            list(APPEND ARCH_FLAGS -mamx-bf16)
        endif()
    endif()

    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
    set(GGML_AMX OFF PARENT_SCOPE)
    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()
@@ -1,7 +1,8 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-cpu-impl.h" // <immintrin.h>
+// hack until AMX is moved into the CPU backend
+#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>

 #include <algorithm>
 #include <memory>
|
@ -317,8 +317,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
|
|||||||
const enum ggml_type type = src0->type;
|
const enum ggml_type type = src0->type;
|
||||||
const int64_t ne0 = op->ne[0];
|
const int64_t ne0 = op->ne[0];
|
||||||
|
|
||||||
bool is_training = src0->grad || src1->grad;
|
|
||||||
|
|
||||||
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
|
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
|
||||||
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
|
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
|
||||||
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
|
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
|
||||||
@ -326,7 +324,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
|
|||||||
bool can_use_amx =
|
bool can_use_amx =
|
||||||
is_contiguous_2d(src0) && // src0 must be contiguous
|
is_contiguous_2d(src0) && // src0 must be contiguous
|
||||||
is_contiguous_2d(src1) && // src1 must be contiguous
|
is_contiguous_2d(src1) && // src1 must be contiguous
|
||||||
!is_training && // inference only
|
|
||||||
src1->type == GGML_TYPE_F32 && // src1 must be float32
|
src1->type == GGML_TYPE_F32 && // src1 must be float32
|
||||||
has_amx_kernels && // with amx kernel impls
|
has_amx_kernels && // with amx kernel impls
|
||||||
ne0 % (TILE_N * 2) == 0; // out_features is 32x
|
ne0 % (TILE_N * 2) == 0; // out_features is 32x
|
||||||
@@ -412,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {

 ggml_backend_reg_t ggml_backend_amx_reg(void) {
     static struct ggml_backend_reg ggml_backend_amx_reg = {
-        /* .iface   = */ ggml_backend_amx_reg_i,
-        /* .context = */ NULL,
+        /* .api_version = */ GGML_BACKEND_API_VERSION,
+        /* .iface       = */ ggml_backend_amx_reg_i,
+        /* .context     = */ NULL,
     };

     return &ggml_backend_amx_reg;
@@ -421,9 +419,18 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {

 #else // if defined(__AMX_INT8__)

+ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
+    return nullptr;
+}
+
+bool ggml_backend_is_amx(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+    return false;
+}
+
 ggml_backend_t ggml_backend_amx_init(void) {
     fprintf(stderr, "GGML is not compiled with AMX support!\n");
-    return ggml_backend_t{};
+    return nullptr;
 }

 void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
@@ -433,4 +440,10 @@ void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
     GGML_UNUSED(n_threads);
 }

+ggml_backend_reg_t ggml_backend_amx_reg(void) {
+    return nullptr;
+}
+
 #endif

+GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg)
@@ -496,19 +496,20 @@ inline void from_float(const float * x, char * vy, int64_t k);

 template <>
 inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
-    quantize_row_q8_0(x, vy, k);
+    // FIXME: using unoptimized reference impl until moved to CPU backend
+    quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
 }

 template <>
 inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
-    quantize_row_q8_1(x, vy, k);
+    quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
 }

 template <>
 inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
 #if 1
     // TODO: this is reference impl!
-    quantize_row_q8_K(x, vy, k);
+    quantize_row_q8_K_ref(x, (block_q8_K *)vy, k);
 #else
     quantize_row_q8_K_vnni(x, vy, k);
 #endif
@@ -8,6 +8,8 @@
 extern "C" {
 #endif

+#define GGML_BACKEND_API_VERSION 1
+
     //
     // Backend buffer type
     //
@@ -63,20 +65,20 @@ extern "C" {
         enum ggml_backend_buffer_usage usage;
     };

-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
         ggml_backend_buffer_type_t   buft,
         struct ggml_backend_buffer_i iface,
         void *                       context,
         size_t                       size);

     // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

     // multi-buffer
     // buffer that contains a collection of buffers
-    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_API bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_API void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

     //
     // Backend (stream)
@@ -199,17 +201,37 @@ extern "C" {
     };

     struct ggml_backend_reg {
-        // int api_version; // TODO: for dynamic loading
+        int api_version; // initialize to GGML_BACKEND_API_VERSION
         struct ggml_backend_reg_i iface;
         void * context;
     };

     // Internal backend registry API
-    void ggml_backend_register(ggml_backend_reg_t reg);
-    void ggml_backend_device_register(ggml_backend_dev_t device);
-    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
-    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
+    GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
+    GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
+    // Add backend dynamic loading support to the backend
+    typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
+
+#ifdef GGML_BACKEND_DL
+#    ifdef __cplusplus
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                                  \
+             extern "C" {                                                     \
+                 GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
+             }                                                                \
+             ggml_backend_reg_t ggml_backend_init(void) {                     \
+                 return reg_fn();                                             \
+             }
+#    else
+#        define GGML_BACKEND_DL_IMPL(reg_fn)                                  \
+             GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void);     \
+             ggml_backend_reg_t ggml_backend_init(void) {                     \
+                 return reg_fn();                                             \
+             }
+#    endif
+#else
+#    define GGML_BACKEND_DL_IMPL(reg_fn)
+#endif

 #ifdef __cplusplus
 }
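(Illustration, not part of the patch: how a backend is expected to use the new api_version field and the GGML_BACKEND_DL_IMPL macro when built as a loadable module; the interface struct is a placeholder.)

    // a backend's registration entry point, exported for dynamic loading
    static struct ggml_backend_reg my_backend_reg = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ my_backend_reg_i,   // placeholder iface struct
        /* .context     = */ NULL,
    };

    ggml_backend_reg_t my_backend_reg_fn(void) {
        return &my_backend_reg;
    }

    GGML_BACKEND_DL_IMPL(my_backend_reg_fn)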
Some files were not shown because too many files have changed in this diff